@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Union, Any
+from typing import Union
 from urllib.parse import parse_qs, urlparse
 
 import tiktoken
@@ -10,7 +10,6 @@
 from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from pydantic import SecretStr
 import time
-from pympler import asizeof
 
 from quivr_core.brain.info import LLMInfo
 from quivr_core.rag.entities.config import DefaultModelSuppliers, LLMEndpointConfig
@@ -19,17 +18,14 @@
 logger = logging.getLogger("quivr_core")
 
 
-def get_size(obj: Any, seen: set | None = None) -> int:
-    return asizeof.asizeof(obj)
-
-
 class LLMTokenizer:
     _cache: dict[
         int, tuple["LLMTokenizer", int, float]
     ] = {}  # {hash: (tokenizer, size_bytes, last_access_time)}
     _max_cache_size_mb: int = 50
-    _max_cache_count: int = 3  # Default maximum number of cached tokenizers
+    _max_cache_count: int = 5  # Default maximum number of cached tokenizers
     _current_cache_size: int = 0
+    _default_size: int = 5 * 1024 * 1024
 
     def __init__(self, tokenizer_hub: str | None, fallback_tokenizer: str):
         self.tokenizer_hub = tokenizer_hub
@@ -63,7 +59,29 @@ def __init__(self, tokenizer_hub: str | None, fallback_tokenizer: str):
             self.tokenizer = tiktoken.get_encoding(self.fallback_tokenizer)
 
         # More accurate size estimation
-        self._size_bytes = get_size(self.tokenizer)
+        self._size_bytes = self._calculate_tokenizer_size()
+
+    def _calculate_tokenizer_size(self) -> int:
+        """Calculate size of tokenizer by summing the sizes of its vocabulary and model files"""
+        # By default, return a size of 5 MB
+        if not hasattr(self.tokenizer, "vocab_files_names") or not hasattr(
+            self.tokenizer, "init_kwargs"
+        ):
+            return self._default_size
+
+        total_size = 0
+
+        # Get the file keys from vocab_files_names
+        file_keys = self.tokenizer.vocab_files_names.keys()
+        # Look up these files in init_kwargs
+        for key in file_keys:
+            if file_path := self.tokenizer.init_kwargs.get(key):
+                try:
+                    total_size += os.path.getsize(file_path)
+                except (OSError, FileNotFoundError):
+                    logger.debug(f"Could not access tokenizer file: {file_path}")
+
+        return total_size if total_size > 0 else self._default_size
 
     @classmethod
     def load(cls, tokenizer_hub: str, fallback_tokenizer: str):
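
For context, the new estimate drops pympler's in-memory measurement and instead sums the on-disk sizes of the files a Hugging Face tokenizer was loaded from: `vocab_files_names` names the vocabulary/merge files, and `init_kwargs` typically holds their resolved local paths after `from_pretrained`. tiktoken encodings used as the fallback expose neither attribute, so they get the 5 MB default. Below is a minimal standalone sketch of the same heuristic; the helper name `estimate_tokenizer_size` and the commented model name are illustrative, not part of this change.

import os


def estimate_tokenizer_size(tokenizer, default_size: int = 5 * 1024 * 1024) -> int:
    """Sum the on-disk sizes of the files a Hugging Face tokenizer was loaded from."""
    if not hasattr(tokenizer, "vocab_files_names") or not hasattr(tokenizer, "init_kwargs"):
        # tiktoken encodings (the fallback) have neither attribute, so use the default estimate
        return default_size
    total = 0
    for key in tokenizer.vocab_files_names:  # e.g. "vocab_file", "merges_file"
        path = tokenizer.init_kwargs.get(key)  # resolved local path, when present
        if path and os.path.isfile(path):
            total += os.path.getsize(path)
    return total or default_size


# Usage sketch (model name is only an example):
# from transformers import AutoTokenizer
# tok = AutoTokenizer.from_pretrained("gpt2")
# print(estimate_tokenizer_size(tok))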
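The eviction logic that consumes `_size_bytes`, `_max_cache_size_mb`, and `_max_cache_count` is outside this hunk. The sketch below shows how such a size- and count-bounded cache could evict by least-recent access, assuming the third tuple element is the last access time (as the inline comment on `_cache` indicates); the function names are illustrative and not the module's actual API.

import time

_cache: dict[int, tuple[object, int, float]] = {}  # {key: (tokenizer, size_bytes, last_access_time)}
_current_cache_size = 0  # running total, in bytes
_max_cache_size_mb = 50
_max_cache_count = 5


def _evict_if_needed(new_entry_size: int) -> None:
    """Drop least-recently-used entries until the new entry fits both budgets."""
    global _current_cache_size
    max_bytes = _max_cache_size_mb * 1024 * 1024
    while _cache and (
        len(_cache) >= _max_cache_count
        or _current_cache_size + new_entry_size > max_bytes
    ):
        # Smallest last-access timestamp = least recently used
        lru_key = min(_cache, key=lambda k: _cache[k][2])
        _, evicted_size, _ = _cache.pop(lru_key)
        _current_cache_size -= evicted_size


def cache_put(key: int, tokenizer: object, size_bytes: int) -> None:
    """Insert a tokenizer, evicting older entries first so both budgets still hold."""
    global _current_cache_size
    _evict_if_needed(size_bytes)
    _cache[key] = (tokenizer, size_bytes, time.time())
    _current_cache_size += size_bytes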