
Commit 6072907

feat: adding cache to LLMEndpoint (#3555)
This allows us to avoid repeating expensive operations, such as reloading the tokenizers, on each call. Closes ENT-394.
1 parent acb0bcb commit 6072907

1 file changed (+12 −1 lines)

core/quivr_core/llm/llm_endpoint.py (+12 −1)
@@ -18,6 +18,8 @@


 class LLMEndpoint:
+    _cache: dict[int, "LLMEndpoint"] = {}
+
     def __init__(self, llm_config: LLMEndpointConfig, llm: BaseChatModel):
         self._config = llm_config
         self._llm = llm
@@ -55,6 +57,13 @@ def get_config(self):

     @classmethod
     def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()):
+        # Create a cache key from the config
+        cache_key = hash(str(config.model_dump()))
+
+        # Return cached instance if it exists
+        if cache_key in cls._cache:
+            return cls._cache[cache_key]
+
         _llm: Union[AzureChatOpenAI, ChatOpenAI, ChatAnthropic, ChatMistralAI]
         try:
             if config.supplier == DefaultModelSuppliers.AZURE:
@@ -112,7 +121,9 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()):
                     max_tokens=config.max_output_tokens,
                     temperature=config.temperature,
                 )
-            return cls(llm=_llm, llm_config=config)
+            instance = cls(llm=_llm, llm_config=config)
+            cls._cache[cache_key] = instance
+            return instance

         except ImportError as e:
             raise ImportError(
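
The practical effect of this change is that two calls to from_config with an equivalent config now resolve to the same LLMEndpoint instance instead of rebuilding the underlying chat model (and reloading its tokenizer) each time. A minimal usage sketch follows; the import path and the specific config field values are illustrative assumptions, not part of the commit (the model and temperature fields do appear in the diff above).

    # Sketch only: assumes LLMEndpoint and LLMEndpointConfig are importable from this module.
    from quivr_core.llm.llm_endpoint import LLMEndpoint, LLMEndpointConfig

    # Two separate-but-equal configs produce the same str(config.model_dump()),
    # hence the same hash-based cache key.
    first = LLMEndpoint.from_config(LLMEndpointConfig(model="gpt-4o", temperature=0.7))
    second = LLMEndpoint.from_config(LLMEndpointConfig(model="gpt-4o", temperature=0.7))

    # The second call hits the class-level _cache and returns the stored instance.
    assert first is second

Note that the cache key is derived from the stringified model_dump of the config, so any differing field value (for example a different temperature) yields a distinct key and a fresh instance.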
