Skip to content

Commit 4f2efb9

Browse files
authored
fix(open_embedding): use embedding for openai in batches as well (#53)
* fix(open_embedding): use embedding for openai in batches as well
* fix: test cases
1 parent 165d421 commit 4f2efb9

File tree

4 files changed

+16
-21
lines changed

4 files changed

+16
-21
lines changed

backend/app/processing/file_preprocessing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def process_segmentation(project_id: int, asset_id: int, asset_file_name: str):
4343

4444
vectorstore.add_docs(
4545
docs=docs,
46-
metadatas=metadatas
46+
metadatas=metadatas,
47+
batch_size=100
4748
)
4849

4950
project_repository.update_asset_content_status(
@@ -67,6 +68,7 @@ def preprocess_file(asset_id: int):
6768
# Get asset details from the database first
6869
with SessionLocal() as db:
6970
asset = project_repository.get_asset(db=db, asset_id=asset_id)
71+
7072
if asset is None:
7173
logger.error(f"Asset with id {asset_id} not found in the database")
7274
return

backend/app/processing/process_queue.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,4 +406,4 @@ def vectorize_extraction_process_step(project_id: int, process_step_id: int, fil
406406
]
407407

408408
# Add documents to vectorstore
409-
vectorstore.add_docs(docs=docs, metadatas=metadatas)
409+
vectorstore.add_docs(docs=docs, metadatas=metadatas, batch_size=100)

backend/app/vectorstore/chroma.py

+8-17
Original file line numberDiff line numberDiff line change
@@ -101,25 +101,16 @@ def add_docs(
101101
filename = metadatas[0].get('filename', 'unknown')
102102
logger.info(f"Adding {len(docs)} sentences to the vector store for file {filename}")
103103

104-
# If using OpenAI embeddings, add all documents at once
105-
if self.settings.use_openai_embeddings and self.settings.openai_api_key:
106-
logger.info("Using OpenAI embeddings")
104+
# Batching the document processing
105+
batch_size = batch_size or self._batch_size
106+
107+
for i in range(0, len(docs), batch_size):
108+
logger.info(f"Processing batch {i} to {i + batch_size}")
107109
self._docs_collection.add(
108-
documents=list(docs),
109-
metadatas=metadatas,
110-
ids=ids,
110+
documents=docs[i : i + batch_size],
111+
metadatas=metadatas[i : i + batch_size],
112+
ids=ids[i : i + batch_size],
111113
)
112-
else:
113-
logger.info("Using default embedding function")
114-
batch_size = batch_size or self._batch_size
115-
116-
for i in range(0, len(docs), batch_size):
117-
logger.info(f"Processing batch {i} to {i + batch_size}")
118-
self._docs_collection.add(
119-
documents=docs[i : i + batch_size],
120-
metadatas=metadatas[i : i + batch_size],
121-
ids=ids[i : i + batch_size],
122-
)
123114

124115
return list(ids)
125116

backend/tests/processing/test_process_queue.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ def test_vectorize_extraction_process_step_single_reference(mock_chroma_db):
215215
# Assertions
216216
mock_vectorstore.add_docs.assert_called_once_with(
217217
docs=expected_docs,
218-
metadatas=expected_metadatas
218+
metadatas=expected_metadatas,
219+
batch_size=100
219220
)
220221

221222
@patch('app.processing.process_queue.ChromaDB')
@@ -261,7 +262,8 @@ def test_vectorize_extraction_process_step_multiple_references_concatenation(moc
261262
# Assertions
262263
mock_vectorstore.add_docs.assert_called_once_with(
263264
docs=expected_docs,
264-
metadatas=expected_metadatas
265+
metadatas=expected_metadatas,
266+
batch_size=100
265267
)
266268

267269
@patch('app.processing.process_queue.ChromaDB') # Replace with the correct module path

0 commit comments

Comments (0)