@@ -210,45 +210,46 @@ def wrapper(*args, **kwargs):
210
210
@handle_exceptions
211
211
def extract_process (api_key , process , process_step , asset_content ):
212
212
pdf_content = ""
213
- vectorstore = ChromaDB (f"panda-etl-{ process .project_id } " , similarity_threshold = 3 )
214
- if (
215
- ("multiple_fields" not in process .details or not process .details ["multiple_fields" ])
216
- and asset_content .content
217
- and asset_content .content .get ("word_count" , 0 ) > 500
218
- ):
219
- for field in process .details ["fields" ]:
220
- relevant_docs = vectorstore .get_relevant_docs (
221
- field ["key" ],
222
- where = {
223
- "$and" : [
224
- {"asset_id" : process_step .asset .id },
225
- {"project_id" : process .project_id },
226
- ]
227
- },
228
- k = 5 ,
229
- )
230
-
231
- for index , metadata in enumerate (relevant_docs ["metadatas" ][0 ]):
232
- segment_data = [relevant_docs ["documents" ][0 ][index ]]
233
- if metadata .get ("previous_sentence_id" , - 1 ) != - 1 :
234
- prev_sentence = vectorstore .get_relevant_docs_by_id (
235
- ids = [metadata ["previous_sentence_id" ]]
236
- )
237
- if prev_sentence ["documents" ] and len (prev_sentence ["documents" ][0 ]) > 0 :
238
- segment_data = [prev_sentence ["documents" ][0 ]] + segment_data
239
- else :
240
- logger .warning ("Previous sentence document is empty." )
241
-
242
- if metadata .get ("next_sentence_id" , - 1 ) != - 1 :
243
- next_sentence = vectorstore .get_relevant_docs_by_id (
244
- ids = [metadata ["next_sentence_id" ]]
245
- )
246
- if next_sentence ["documents" ] and len (next_sentence ["documents" ][0 ]) > 0 :
247
- segment_data .append (next_sentence ["documents" ][0 ])
248
- else :
249
- logger .warning ("Next sentence document is empty." )
250
-
251
- pdf_content += "\n " + " " .join (segment_data )
213
+ # TODO: Vector store PDF content fetching is temporarily disabled until it is fixed; re-enable the block below once resolved.
214
+ # vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
215
+ # if (
216
+ # ("multiple_fields" not in process.details or not process.details["multiple_fields"])
217
+ # and asset_content.content
218
+ # and asset_content.content.get("word_count", 0) > 500
219
+ # ):
220
+ # for field in process.details["fields"]:
221
+ # relevant_docs = vectorstore.get_relevant_docs(
222
+ # field["key"],
223
+ # where={
224
+ # "$and": [
225
+ # {"asset_id": process_step.asset.id},
226
+ # {"project_id": process.project_id},
227
+ # ]
228
+ # },
229
+ # k=5,
230
+ # )
231
+
232
+ # for index, metadata in enumerate(relevant_docs["metadatas"][0]):
233
+ # segment_data = [relevant_docs["documents"][0][index]]
234
+ # if metadata.get("previous_sentence_id", -1) != -1:
235
+ # prev_sentence = vectorstore.get_relevant_docs_by_id(
236
+ # ids=[metadata["previous_sentence_id"]]
237
+ # )
238
+ # if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
239
+ # segment_data = [prev_sentence["documents"][0]] + segment_data
240
+ # else:
241
+ # logger.warning("Previous sentence document is empty.")
242
+
243
+ # if metadata.get("next_sentence_id", -1) != -1:
244
+ # next_sentence = vectorstore.get_relevant_docs_by_id(
245
+ # ids=[metadata["next_sentence_id"]]
246
+ # )
247
+ # if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
248
+ # segment_data.append(next_sentence["documents"][0])
249
+ # else:
250
+ # logger.warning("Next sentence document is empty.")
251
+
252
+ # pdf_content += "\n" + " ".join(segment_data)
252
253
253
254
if not pdf_content :
254
255
pdf_content = (
0 commit comments