Commit bc6d75d
feat: megaparse v54 (#3594)
Parent: 699b549

7 files changed (+54, -27 lines)

core/.flake8 (+4)

@@ -0,0 +1,4 @@
+[flake8]
+; Minimal configuration for Flake8 to work with Black.
+max-line-length = 100
+ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,E203

core/pyproject.toml (+1, -1)

@@ -21,7 +21,7 @@ dependencies = [
     "faiss-cpu>=1.8.0.post1",
     "rapidfuzz>=3.10.1",
     "markupsafe>=2.1.5",
-    "megaparse-sdk>=0.1.9",
+    "megaparse-sdk>=0.1.11",
     "langchain-mistralai>=0.2.3",
     "fasttext-langdetect>=1.0.5",
     "langfuse>=2.57.0",

core/quivr_core/processor/implementations/default.py (+6, -4)

@@ -18,11 +18,10 @@
 )
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_community.document_loaders.text import TextLoader
-from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

 from quivr_core.files.file import FileExtension, QuivrFile
-from quivr_core.processor.processor_base import ProcessorBase
+from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
 from quivr_core.processor.splitter import SplitterConfig

 logger = logging.getLogger("quivr_core")
@@ -74,7 +73,7 @@ def processor_metadata(self) -> dict[str, Any]:
                 "splitter": self.splitter_config.model_dump(),
             }

-        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
+        async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[None]:
             if hasattr(self.loader_cls, "__init__"):
                 # NOTE: mypy can't correctly type this as BaseLoader doesn't have a constructor method
                 loader = self.loader_cls(file_path=str(file.path), **self.loader_kwargs)  # type: ignore
@@ -85,9 +84,12 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
             docs = self.text_splitter.split_documents(documents)

             for doc in docs:
+                # TODO: This metadata info should be typed
                 doc.metadata = {"chunk_size": len(enc.encode(doc.page_content))}

-            return docs
+            return ProcessedDocument(
+                chunks=docs, processor_cls=cls_name, processor_response=None
+            )

     return type(cls_name, (ProcessorInit,), dict(_Processor.__dict__))
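The generated default processors now wrap their chunks in ProcessedDocument[None] instead of returning a bare list. A minimal sketch of how a caller might adapt; the ingest helper and the way the processor instance is obtained are assumptions for illustration, not part of this commit:

from quivr_core.files.file import QuivrFile
from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase


async def ingest(processor: ProcessorBase[None], qfile: QuivrFile) -> int:
    # process_file() used to return list[Document]; it now returns a
    # ProcessedDocument whose .chunks attribute holds the split Documents.
    result: ProcessedDocument[None] = await processor.process_file(qfile)
    return sum(chunk.metadata["chunk_size"] for chunk in result.chunks)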

core/quivr_core/processor/implementations/megaparse_processor.py (+15, -9)

@@ -5,17 +5,18 @@
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
 from megaparse_sdk.client import MegaParseNATSClient
 from megaparse_sdk.config import ClientNATSConfig
+from megaparse_sdk.schema.document import Document as MPDocument

 from quivr_core.config import MegaparseConfig
 from quivr_core.files.file import QuivrFile
-from quivr_core.processor.processor_base import ProcessorBase
+from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
 from quivr_core.processor.registry import FileExtension
 from quivr_core.processor.splitter import SplitterConfig

 logger = logging.getLogger("quivr_core")


-class MegaparseProcessor(ProcessorBase):
+class MegaparseProcessor(ProcessorBase[MPDocument]):
     """
     Megaparse processor for PDF files.

@@ -72,17 +73,22 @@ def processor_metadata(self):
             "chunk_overlap": self.splitter_config.chunk_overlap,
         }

-    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
+    async def process_file_inner(
+        self, file: QuivrFile
+    ) -> ProcessedDocument[MPDocument | str]:
         logger.info(f"Uploading file {file.path} to MegaParse")
         async with MegaParseNATSClient(ClientNATSConfig()) as client:
             response = await client.parse_file(file=file.path)

-        logger.info(f"File : {response}")
         document = Document(
-            page_content=response,
+            page_content=str(response),
         )

-        docs = self.text_splitter.split_documents([document])
-        for doc in docs:
-            doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
-        return docs
+        chunks = self.text_splitter.split_documents([document])
+        for chunk in chunks:
+            chunk.metadata = {"chunk_size": len(self.enc.encode(chunk.page_content))}
+        return ProcessedDocument(
+            chunks=chunks,
+            processor_cls="MegaparseProcessor",
+            processor_response=response,
+        )
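Since MegaparseProcessor is now parametrized as ProcessorBase[MPDocument], one call yields both the langchain chunks and the raw MegaParse response. A hedged usage sketch, assuming a default-constructed processor; the parse_pdf wrapper is illustrative and not part of this commit:

from quivr_core.files.file import QuivrFile
from quivr_core.processor.implementations.megaparse_processor import MegaparseProcessor


async def parse_pdf(qfile: QuivrFile):
    processor = MegaparseProcessor()
    processed = await processor.process_file(qfile)
    # Chunks for indexing, plus the raw parser payload (MPDocument, or str fallback).
    return processed.chunks, processed.processor_response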

core/quivr_core/processor/implementations/simple_txt_processor.py (+5, -3)

@@ -4,7 +4,7 @@
 from langchain_core.documents import Document

 from quivr_core.files.file import QuivrFile
-from quivr_core.processor.processor_base import ProcessorBase
+from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
 from quivr_core.processor.registry import FileExtension
 from quivr_core.processor.splitter import SplitterConfig

@@ -47,7 +47,7 @@ def processor_metadata(self) -> dict[str, Any]:
             "splitter": self.splitter_config.model_dump(),
         }

-    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
+    async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[str]:
         async with aiofiles.open(file.path, mode="r") as f:
             content = await f.read()

@@ -57,4 +57,6 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
             doc, self.splitter_config.chunk_size, self.splitter_config.chunk_overlap
         )

-        return docs
+        return ProcessedDocument(
+            chunks=docs, processor_cls="SimpleTxtProcessor", processor_response=content
+        )
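For SimpleTxtProcessor the type parameter R is str, so processor_response carries the raw text read from the file. A small assumed example of keeping it alongside the chunks; the load_txt helper and default-constructed processor are illustrative only:

from quivr_core.files.file import QuivrFile
from quivr_core.processor.implementations.simple_txt_processor import SimpleTxtProcessor


async def load_txt(qfile: QuivrFile) -> tuple[int, str]:
    processed = await SimpleTxtProcessor().process_file(qfile)
    # processor_response is the full, unsplit file content (typed as str).
    return len(processed.chunks), processed.processor_response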

core/quivr_core/processor/implementations/tika_processor.py (+6, -4)

@@ -1,14 +1,14 @@
-import tiktoken
 import logging
 import os
 from typing import AsyncIterable

 import httpx
+import tiktoken
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

 from quivr_core.files.file import QuivrFile
-from quivr_core.processor.processor_base import ProcessorBase
+from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
 from quivr_core.processor.registry import FileExtension
 from quivr_core.processor.splitter import SplitterConfig

@@ -70,12 +70,14 @@ def processor_metadata(self):
             "chunk_overlap": self.splitter_config.chunk_overlap,
         }

-    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
+    async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[None]:
         async with file.open() as f:
             txt = await self._send_parse_tika(f)
         document = Document(page_content=txt)
         docs = self.text_splitter.split_documents([document])
         for doc in docs:
             doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}

-        return docs
+        return ProcessedDocument(
+            chunks=docs, processor_cls="TikaProcessor", processor_response=None
+        )
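TikaProcessor keeps no raw parser payload, so it is typed with R = None. Because every processor now returns the same ProcessedDocument shape, downstream code can handle them uniformly; the describe helper below is a hypothetical sketch, not part of the commit:

from quivr_core.processor.processor_base import ProcessedDocument


def describe(processed: ProcessedDocument) -> str:
    # processor_response is None for processors that discard the raw parser output.
    kept = "no raw payload" if processed.processor_response is None else "raw payload kept"
    return f"{processed.processor_cls}: {len(processed.chunks)} chunks, {kept}"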

core/quivr_core/processor/processor_base.py (+17, -6)

@@ -1,8 +1,9 @@
 import logging
 from abc import ABC, abstractmethod
 from importlib.metadata import PackageNotFoundError, version
-from typing import Any
+from typing import Any, Generic, List, TypeVar

+from attr import dataclass
 from langchain_core.documents import Document

 from quivr_core.files.file import FileExtension, QuivrFile
@@ -11,13 +12,23 @@
 logger = logging.getLogger("quivr_core")


+R = TypeVar("R", covariant=True)
+
+
+@dataclass
+class ProcessedDocument(Generic[R]):
+    chunks: List[Document]
+    processor_cls: str
+    processor_response: R
+
+
 # TODO: processors should be cached somewhere ?
 # The processor should be cached by processor type
 # The cache should use a single
-class ProcessorBase(ABC):
+class ProcessorBase(ABC, Generic[R]):
     supported_extensions: list[FileExtension | str]

-    def check_supported(self, file: QuivrFile):
+    def check_supported(self, file: QuivrFile) -> None:
         if file.file_extension not in self.supported_extensions:
             raise ValueError(f"can't process a file of type {file.file_extension}")

@@ -26,7 +37,7 @@ def check_supported(self, file: QuivrFile):
     def processor_metadata(self) -> dict[str, Any]:
         raise NotImplementedError

-    async def process_file(self, file: QuivrFile) -> list[Document]:
+    async def process_file(self, file: QuivrFile) -> ProcessedDocument[R]:
         logger.debug(f"Processing file {file}")
         self.check_supported(file)
         docs = await self.process_file_inner(file)
@@ -35,7 +46,7 @@ async def process_file(self, file: QuivrFile) -> list[Document]:
         except PackageNotFoundError:
             qvr_version = "dev"

-        for idx, doc in enumerate(docs, start=1):
+        for idx, doc in enumerate(docs.chunks, start=1):
             if "original_file_name" in doc.metadata:
                 doc.page_content = f"Filename: {doc.metadata['original_file_name']} Content: {doc.page_content}"
             doc.page_content = doc.page_content.replace("\u0000", "")
@@ -56,5 +67,5 @@ async def process_file(self, file: QuivrFile) -> list[Document]:
         return docs

     @abstractmethod
-    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
+    async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[R]:
         raise NotImplementedError
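ProcessedDocument is generic over R, the type of the raw parser payload, and ProcessorBase[R] threads that type through process_file and process_file_inner. A minimal sketch of a custom subclass under the new API; EchoProcessor, its metadata dict, and the byte-decoding of QuivrFile.open() are assumptions for illustration, not code from this commit:

from typing import Any

from langchain_core.documents import Document

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase


class EchoProcessor(ProcessorBase[str]):  # R = str: the raw payload is the file text
    supported_extensions = [FileExtension.txt]

    @property
    def processor_metadata(self) -> dict[str, Any]:
        return {"processor_cls": "EchoProcessor"}

    async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[str]:
        async with file.open() as f:
            text = (await f.read()).decode("utf-8")
        return ProcessedDocument(
            chunks=[Document(page_content=text)],
            processor_cls="EchoProcessor",
            processor_response=text,
        )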
