Skip to content

Commit 190d971

Browse files
authored
feat(megaparse): add sdk (#3462)
What it does: adds the MegaParse API call so that files are parsed through the MegaParse SDK.
1 parent 1356d87 commit 190d971

File tree

6 files changed

+153
-97
lines changed

6 files changed

+153
-97
lines changed

core/pyproject.toml

+2-5
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,12 @@ dependencies = [
2222
"transformers[sentencepiece]>=4.44.2",
2323
"faiss-cpu>=1.8.0.post1",
2424
"rapidfuzz>=3.10.1",
25+
"megaparse-sdk>=0.1.2",
26+
"markupsafe>=2.1.5",
2527
]
2628
readme = "README.md"
2729
requires-python = ">= 3.11"
2830

29-
[project.optional-dependencies]
30-
all = [
31-
"unstructured[epub,docx,odt,doc,pptx,ppt,xlsx,md]>=0.15.5",
32-
"docx2txt>=0.8",
33-
]
3431
[build-system]
3532
requires = ["hatchling"]
3633
build-backend = "hatchling.build"

core/quivr_core/config.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,22 @@
44
from pydantic import BaseModel
55

66

7-
class PdfParser(str, Enum):
8-
LLAMA_PARSE = "llama_parse"
7+
class ParserType(str, Enum):
8+
"""Parser type enumeration."""
9+
910
UNSTRUCTURED = "unstructured"
11+
LLAMA_PARSER = "llama_parser"
1012
MEGAPARSE_VISION = "megaparse_vision"
1113

1214

15+
class StrategyEnum(str, Enum):
16+
"""Method to use for the conversion"""
17+
18+
FAST = "fast"
19+
AUTO = "auto"
20+
HI_RES = "hi_res"
21+
22+
1323
class MegaparseBaseConfig(BaseModel):
1424
@classmethod
1525
def from_yaml(cls, file_path: str):
@@ -22,6 +32,8 @@ def from_yaml(cls, file_path: str):
2232

2333

2434
class MegaparseConfig(MegaparseBaseConfig):
25-
strategy: str = "fast"
26-
llama_parse_api_key: str | None = None
27-
pdf_parser: PdfParser = PdfParser.UNSTRUCTURED
35+
method: ParserType = ParserType.UNSTRUCTURED
36+
strategy: StrategyEnum = StrategyEnum.AUTO
37+
check_table: bool = False
38+
parsing_instruction: str | None = None
39+
model_name: str = "gpt-4o"

core/quivr_core/processor/implementations/megaparse_processor.py

+38-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import logging
2+
import os
23

34
import tiktoken
45
from langchain_core.documents import Document
56
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
6-
from megaparse import MegaParse
7+
from megaparse_sdk import MegaParseSDK
78

89
from quivr_core.config import MegaparseConfig
910
from quivr_core.files.file import QuivrFile
@@ -29,7 +30,24 @@ class MegaparseProcessor(ProcessorBase):
2930
3031
"""
3132

32-
supported_extensions = [FileExtension.pdf]
33+
supported_extensions = [
34+
FileExtension.pdf,
35+
FileExtension.docx,
36+
FileExtension.doc,
37+
FileExtension.pptx,
38+
FileExtension.xls,
39+
FileExtension.xlsx,
40+
FileExtension.csv,
41+
FileExtension.epub,
42+
FileExtension.bib,
43+
FileExtension.odt,
44+
FileExtension.html,
45+
FileExtension.py,
46+
FileExtension.markdown,
47+
FileExtension.md,
48+
FileExtension.mdx,
49+
FileExtension.ipynb,
50+
]
3351

3452
def __init__(
3553
self,
@@ -56,9 +74,24 @@ def processor_metadata(self):
5674
}
5775

5876
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
59-
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
60-
document: Document = await mega_parse.aload()
61-
if len(document.page_content) > self.splitter_config.chunk_size:
77+
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
78+
megaparse = MegaParseSDK(api_key)
79+
logger.info(f"Uploading file {file.path} to MegaParse")
80+
data = {
81+
"method": self.megaparse_config.method,
82+
"strategy": self.megaparse_config.strategy,
83+
"check_table": self.megaparse_config.check_table,
84+
"parsing_instruction": self.megaparse_config.parsing_instruction,
85+
"model_name": self.megaparse_config.model_name,
86+
}
87+
response = await megaparse.file.upload(
88+
file_path=str(file.path),
89+
**data,
90+
)
91+
document = Document(
92+
page_content=response["result"],
93+
)
94+
if len(response) > self.splitter_config.chunk_size:
6295
docs = self.text_splitter.split_documents([document])
6396
for doc in docs:
6497
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}

core/quivr_core/processor/registry.py

+52-31
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import types
44
from dataclasses import dataclass, field
55
from heapq import heappop, heappush
6-
from typing import Type, TypeAlias
6+
from typing import List, Type, TypeAlias
77

88
from quivr_core.files.file import FileExtension
99

@@ -49,37 +49,41 @@ class ProcEntry:
4949

5050
def _append_proc_mapping(
5151
mapping: ProcMapping,
52-
file_ext: FileExtension | str,
52+
file_exts: List[FileExtension] | List[str],
5353
cls_mod: str,
5454
errtxt: str,
5555
priority: int | None,
5656
):
57-
if file_ext in mapping:
58-
try:
59-
prev_proc = heappop(mapping[file_ext])
60-
proc_entry = ProcEntry(
61-
priority=priority if priority is not None else prev_proc.priority - 1,
62-
cls_mod=cls_mod,
63-
err=errtxt,
64-
)
65-
# Push the previous processor back
66-
heappush(mapping[file_ext], prev_proc)
67-
heappush(mapping[file_ext], proc_entry)
68-
except IndexError:
57+
for file_ext in file_exts:
58+
if file_ext in mapping:
59+
try:
60+
prev_proc = heappop(mapping[file_ext])
61+
proc_entry = ProcEntry(
62+
priority=priority
63+
if priority is not None
64+
else prev_proc.priority - 1,
65+
cls_mod=cls_mod,
66+
err=errtxt,
67+
)
68+
# Push the previous processor back
69+
heappush(mapping[file_ext], prev_proc)
70+
heappush(mapping[file_ext], proc_entry)
71+
except IndexError:
72+
proc_entry = ProcEntry(
73+
priority=priority if priority is not None else _LOWEST_PRIORITY,
74+
cls_mod=cls_mod,
75+
err=errtxt,
76+
)
77+
heappush(mapping[file_ext], proc_entry)
78+
79+
else:
6980
proc_entry = ProcEntry(
7081
priority=priority if priority is not None else _LOWEST_PRIORITY,
7182
cls_mod=cls_mod,
7283
err=errtxt,
7384
)
74-
heappush(mapping[file_ext], proc_entry)
7585

76-
else:
77-
proc_entry = ProcEntry(
78-
priority=priority if priority is not None else _LOWEST_PRIORITY,
79-
cls_mod=cls_mod,
80-
err=errtxt,
81-
)
82-
mapping[file_ext] = [proc_entry]
86+
mapping[file_ext] = [proc_entry]
8387

8488

8589
def defaults_to_proc_entries(
@@ -109,21 +113,38 @@ def defaults_to_proc_entries(
109113
ext_str = ext.value if isinstance(ext, FileExtension) else ext
110114
_append_proc_mapping(
111115
mapping=base_processors,
112-
file_ext=ext,
116+
file_exts=[ext],
113117
cls_mod=f"quivr_core.processor.implementations.default.{processor_name}",
114118
errtxt=f"can't import {processor_name}. Please install quivr-core[{ext_str}] to access {processor_name}",
115119
priority=None,
116120
)
117121

118122
# TODO(@aminediro): Megaparse should register itself
119123
# Append Megaparse
120-
# _append_proc_mapping(
121-
# mapping=base_processors,
122-
# file_ext=FileExtension.pdf,
123-
# cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
124-
# errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
125-
# priority=None,
126-
# )
124+
_append_proc_mapping(
125+
mapping=base_processors,
126+
file_exts=[
127+
FileExtension.pdf,
128+
FileExtension.docx,
129+
FileExtension.doc,
130+
FileExtension.pptx,
131+
FileExtension.xls,
132+
FileExtension.xlsx,
133+
FileExtension.csv,
134+
FileExtension.epub,
135+
FileExtension.bib,
136+
FileExtension.odt,
137+
FileExtension.html,
138+
FileExtension.py,
139+
FileExtension.markdown,
140+
FileExtension.md,
141+
FileExtension.mdx,
142+
FileExtension.ipynb,
143+
],
144+
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
145+
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
146+
priority=None,
147+
)
127148
return base_processors
128149

129150

@@ -181,7 +202,7 @@ def register_processor(
181202
if all(proc_cls != proc.cls_mod for proc in known_processors[file_ext]):
182203
_append_proc_mapping(
183204
known_processors,
184-
file_ext=file_ext,
205+
file_exts=[file_ext],
185206
cls_mod=proc_cls,
186207
errtxt=errtxt
187208
or f"{proc_cls} import failed for processor of {file_ext}",

core/tests/processor/pdf/test_megaparse_pdf_processor.py

-51
This file was deleted.

examples/simple_question_megaparse.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import os
2+
3+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
4+
from quivr_core import Brain
5+
from quivr_core.llm.llm_endpoint import LLMEndpoint
6+
from quivr_core.rag.entities.config import LLMEndpointConfig
7+
from rich.console import Console
8+
from rich.panel import Panel
9+
from rich.prompt import Prompt
10+
11+
if __name__ == "__main__":
12+
brain = Brain.from_files(
13+
name="test_brain",
14+
file_paths=["./tests/processor/docx/demo.docx"],
15+
llm=LLMEndpoint(
16+
llm_config=LLMEndpointConfig(model="gpt-4o"),
17+
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
18+
),
19+
)
20+
embedder = embeddings = OpenAIEmbeddings(
21+
model="text-embedding-3-large",
22+
)
23+
# Check brain info
24+
brain.print_info()
25+
26+
console = Console()
27+
console.print(Panel.fit("Ask your brain !", style="bold magenta"))
28+
29+
while True:
30+
# Get user input
31+
question = Prompt.ask("[bold cyan]Question[/bold cyan]")
32+
33+
# Check if user wants to exit
34+
if question.lower() == "exit":
35+
console.print(Panel("Goodbye!", style="bold yellow"))
36+
break
37+
38+
answer = brain.ask(question)
39+
# Print the answer
40+
console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
41+
42+
console.print("-" * console.width)
43+
44+
brain.print_info()

0 commit comments

Comments
 (0)