Skip to content

Commit 0ace6e6

Browse files
committed
fix(extraction): pass complete pdf content
1 parent dd1365b commit 0ace6e6

File tree

2 files changed

+40
-40
lines changed

2 files changed

+40
-40
lines changed

backend/app/processing/process_queue.py

+40 -39
Original file line number | Diff line number | Diff line change
@@ -210,45 +210,46 @@ def wrapper(*args, **kwargs):
210210
@handle_exceptions
211211
def extract_process(api_key, process, process_step, asset_content):
212212
pdf_content = ""
213-
vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
214-
if (
215-
("multiple_fields" not in process.details or not process.details["multiple_fields"])
216-
and asset_content.content
217-
and asset_content.content.get("word_count", 0) > 500
218-
):
219-
for field in process.details["fields"]:
220-
relevant_docs = vectorstore.get_relevant_docs(
221-
field["key"],
222-
where={
223-
"$and": [
224-
{"asset_id": process_step.asset.id},
225-
{"project_id": process.project_id},
226-
]
227-
},
228-
k=5,
229-
)
230-
231-
for index, metadata in enumerate(relevant_docs["metadatas"][0]):
232-
segment_data = [relevant_docs["documents"][0][index]]
233-
if metadata.get("previous_sentence_id", -1) != -1:
234-
prev_sentence = vectorstore.get_relevant_docs_by_id(
235-
ids=[metadata["previous_sentence_id"]]
236-
)
237-
if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
238-
segment_data = [prev_sentence["documents"][0]] + segment_data
239-
else:
240-
logger.warning("Previous sentence document is empty.")
241-
242-
if metadata.get("next_sentence_id", -1) != -1:
243-
next_sentence = vectorstore.get_relevant_docs_by_id(
244-
ids=[metadata["next_sentence_id"]]
245-
)
246-
if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
247-
segment_data.append(next_sentence["documents"][0])
248-
else:
249-
logger.warning("Next sentence document is empty.")
250-
251-
pdf_content += "\n" + " ".join(segment_data)
213+
# TODO: Vector store PDF content fetching is temporarily disabled; re-enable it once fixed.
214+
# vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
215+
# if (
216+
# ("multiple_fields" not in process.details or not process.details["multiple_fields"])
217+
# and asset_content.content
218+
# and asset_content.content.get("word_count", 0) > 500
219+
# ):
220+
# for field in process.details["fields"]:
221+
# relevant_docs = vectorstore.get_relevant_docs(
222+
# field["key"],
223+
# where={
224+
# "$and": [
225+
# {"asset_id": process_step.asset.id},
226+
# {"project_id": process.project_id},
227+
# ]
228+
# },
229+
# k=5,
230+
# )
231+
232+
# for index, metadata in enumerate(relevant_docs["metadatas"][0]):
233+
# segment_data = [relevant_docs["documents"][0][index]]
234+
# if metadata.get("previous_sentence_id", -1) != -1:
235+
# prev_sentence = vectorstore.get_relevant_docs_by_id(
236+
# ids=[metadata["previous_sentence_id"]]
237+
# )
238+
# if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
239+
# segment_data = [prev_sentence["documents"][0]] + segment_data
240+
# else:
241+
# logger.warning("Previous sentence document is empty.")
242+
243+
# if metadata.get("next_sentence_id", -1) != -1:
244+
# next_sentence = vectorstore.get_relevant_docs_by_id(
245+
# ids=[metadata["next_sentence_id"]]
246+
# )
247+
# if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
248+
# segment_data.append(next_sentence["documents"][0])
249+
# else:
250+
# logger.warning("Next sentence document is empty.")
251+
252+
# pdf_content += "\n" + " ".join(segment_data)
252253

253254
if not pdf_content:
254255
pdf_content = (

backend/tests/processing/test_process_queue.py

-1
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,6 @@ def test_extract_process(mock_chroma, mock_extract_data):
6969
assert result["fields"] == [{"field1": "value1"}]
7070
assert result["context"] == [[{'name': 'ESG_Reporting_Assurance', 'sources': ['Assurance'], 'page_numbers': None}]]
7171
mock_extract_data.assert_called_once()
72-
mock_chroma_instance.get_relevant_docs.assert_called()
7372

7473
def test_update_process_step_status():
7574
mock_db = Mock()

0 commit comments

Comments
 (0)