Skip to content

Commit 063bbd3

Browse files
authored
fix: added chunk_size in tika processor (#3466)
1 parent 190d971 commit 063bbd3

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

core/quivr_core/processor/implementations/tika_processor.py

+4
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import tiktoken
12
import logging
23
import os
34
from typing import AsyncIterable
@@ -39,6 +40,7 @@ def __init__(
3940
self.max_retries = max_retries
4041
self._client = httpx.AsyncClient(timeout=timeout)
4142

43+
self.enc = tiktoken.get_encoding("cl100k_base")
4244
self.splitter_config = splitter_config
4345

4446
if splitter:
@@ -73,5 +75,7 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
7375
txt = await self._send_parse_tika(f)
7476
document = Document(page_content=txt)
7577
docs = self.text_splitter.split_documents([document])
78+
for doc in docs:
79+
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
7680

7781
return docs

0 commit comments

Comments
 (0)