insoftai
diff --git a/Diff for: ‎3-feature-pipeline/llm/__init__.py b/Diff for: ‎3-feature-pipeline/llm/__init__.py
diff --git a/Diff for: ‎3-feature-pipeline/llm/chain.py
+10 b/Diff for: ‎3-feature-pipeline/llm/chain.py
+10
diff --git a/Diff for: ‎3-feature-pipeline/llm/prompt_templates.py
+69 b/Diff for: ‎3-feature-pipeline/llm/prompt_templates.py
+69
diff --git a/Diff for: ‎3-feature-pipeline/models/__init__.py b/Diff for: ‎3-feature-pipeline/models/__init__.py
diff --git a/Diff for: ‎3-feature-pipeline/models/base.py
+25 b/Diff for: ‎3-feature-pipeline/models/base.py
+25
diff --git a/Diff for: ‎3-feature-pipeline/models/chunk.py
+33 b/Diff for: ‎3-feature-pipeline/models/chunk.py
+33
diff --git a/Diff for: ‎3-feature-pipeline/models/clean.py
+63 b/Diff for: ‎3-feature-pipeline/models/clean.py
+63
diff --git a/Diff for: ‎3-feature-pipeline/models/embedded_chunk.py
+81 b/Diff for: ‎3-feature-pipeline/models/embedded_chunk.py
+81
diff --git a/Diff for: ‎3-feature-pipeline/models/raw.py
+24 b/Diff for: ‎3-feature-pipeline/models/raw.py
+24
diff --git a/Diff for: ‎3-feature-pipeline/rag/__init__.py b/Diff for: ‎3-feature-pipeline/rag/__init__.py
diff --git a/Diff for: ‎3-feature-pipeline/rag/query_expanison.py
+27 b/Diff for: ‎3-feature-pipeline/rag/query_expanison.py
+27
diff --git a/Diff for: ‎3-feature-pipeline/rag/reranking.py
+35 b/Diff for: ‎3-feature-pipeline/rag/reranking.py
+35
@@ -0,0 +1,10 @@
+from langchain.chains.llm import LLMChain
+from langchain.prompts import PromptTemplate
+
+
+class GeneralChain:
+    """Factory for single-prompt LangChain ``LLMChain`` objects."""
+
+    @staticmethod
+    def get_chain(llm, template: PromptTemplate, output_key: str, verbose=True):
+        """Wrap ``llm`` and ``template`` in an ``LLMChain``.
+
+        Args:
+            llm: Language model object, handed to ``LLMChain`` unchanged.
+            template: Prompt template used as the chain's prompt.
+            output_key: Passed through as the chain's ``output_key``.
+            verbose: Forwarded to ``LLMChain``; defaults to ``True``.
+
+        Returns:
+            The newly constructed ``LLMChain``.
+        """
+        chain_options = {
+            "llm": llm,
+            "prompt": template,
+            "output_key": output_key,
+            "verbose": verbose,
+        }
+        return LLMChain(**chain_options)
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+
+from langchain.prompts import PromptTemplate
+from pydantic import BaseModel
+
+
+class BasePromptTemplate(ABC, BaseModel):
+    """Common interface for prompt holders that can build a ``PromptTemplate``."""
+
+    @abstractmethod
+    def create_template(self, *args, **kwargs) -> PromptTemplate:
+        """Build the LangChain ``PromptTemplate`` for this prompt.
+
+        Concrete templates may need extra arguments (for example a number of
+        questions to generate or a ``keep_top_k`` limit), so the abstract
+        signature accepts arbitrary positional and keyword arguments rather
+        than declaring a zero-argument contract that subclasses would break.
+
+        Returns:
+            The configured ``PromptTemplate``.
+        """
+
+
+class QueryExpansionTemplate(BasePromptTemplate):
+    """Prompt asking the model for several rewordings of one user question."""
+
+    prompt: str = """You are an AI language model assistant. Your task is to generate {to_expand_to_n}
+    different versions of the given user question to retrieve relevant documents from a vector
+    database. By generating multiple perspectives on the user question, your goal is to help
+    the user overcome some of the limitations of the distance-based similarity search.
+    Provide these alternative questions seperated by '{separator}'.
+    Original question: {question}"""
+
+    @property
+    def separator(self) -> str:
+        """Marker the prompt tells the model to put between the questions."""
+        return "#next-question#"
+
+    def create_template(self, to_expand_to_n: int) -> PromptTemplate:
+        """Return a template with the separator and question count pre-filled.
+
+        Args:
+            to_expand_to_n: Number substituted for ``{to_expand_to_n}``.
+
+        Returns:
+            A ``PromptTemplate`` whose only remaining input is ``question``.
+        """
+        prefilled = {
+            "separator": self.separator,
+            "to_expand_to_n": to_expand_to_n,
+        }
+        return PromptTemplate(
+            template=self.prompt,
+            input_variables=["question"],
+            partial_variables=prefilled,
+        )
+
+
+class SelfQueryTemplate(BasePromptTemplate):
+    """Prompt asking the model to pull the user/author id out of a question."""
+
+    prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question.
+    The required information that needs to be extracted is the user or author id. 
+    Your response should consists of only the extracted id (e.g. 1345256), nothing else.
+    If you cannot find the author id, return the string "None".
+    User question: {question}"""
+
+    def create_template(self) -> PromptTemplate:
+        """Return a ``PromptTemplate`` whose single input variable is ``question``."""
+        template_options = {
+            "template": self.prompt,
+            "input_variables": ["question"],
+        }
+        return PromptTemplate(**template_options)
+
+
+class RerankingTemplate(BasePromptTemplate):
+    """Prompt asking the model to reorder passages by relevance to a query."""
+
+    prompt: str = """You are an AI language model assistant. Your task is to rerank passages related to a query
+    based on their relevance. 
+    The most relevant passages should be put at the beginning. 
+    You should only pick at max {keep_top_k} passages.
+    The provided and reranked documents are separated by '{separator}'.
+    
+    The following are passages related to this query: {question}.
+    
+    Passages: 
+    {passages}
+    """
+
+    @property
+    def separator(self) -> str:
+        """Delimiter between passages: a marker wrapped in newlines."""
+        return "\n#next-document#\n"
+
+    def create_template(self, keep_top_k: int) -> PromptTemplate:
+        """Return a template with the passage limit and separator pre-filled.
+
+        Args:
+            keep_top_k: Number substituted for ``{keep_top_k}`` in the prompt.
+
+        Returns:
+            A ``PromptTemplate`` that still expects ``question`` and
+            ``passages`` at invocation time.
+        """
+        prefilled = {"keep_top_k": keep_top_k, "separator": self.separator}
+        return PromptTemplate(
+            template=self.prompt,
+            input_variables=["question", "passages"],
+            partial_variables=prefilled,
+        )
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+
+class DataModel(BaseModel):
+    """
+    Common pydantic base for the data models (raw, chunk and vector-DB models
+    derive from it).
+
+    Note: this is a plain ``BaseModel``; it does not inherit from ``ABC`` and
+    declares no abstract methods.
+    """
+
+    # Identifier of the entry, stored as a string.
+    entry_id: str
+    # String tag for the record; allowed values are not constrained here.
+    type: str
+
+
+class VectorDBDataModel(ABC, DataModel):
+    """
+    Abstract class for all data models that need to be saved into a vector DB (e.g. Qdrant)
+
+    Subclasses must implement ``to_payload``.
+    """
+
+    # Declared as ``str`` to agree with ``DataModel`` and with the concrete
+    # cleaned / embedded-chunk models, which all declare ``entry_id: str``.
+    entry_id: str
+    type: str
+
+    @abstractmethod
+    def to_payload(self) -> tuple:
+        """Return this model as a tuple ready to be written to the vector DB.
+
+        The tuple layout (how many elements, and which) is chosen by each
+        subclass.
+        """
@@ -0,0 +1,33 @@
+from typing import Optional
+
+from models.base import DataModel
+
+
+class PostChunkModel(DataModel):
+    """Pydantic model for a single chunk of a post plus the post's metadata."""
+
+    # ``entry_id`` and ``type`` restate the fields already declared on DataModel.
+    entry_id: str
+    platform: str
+    chunk_id: str
+    chunk_content: str
+    author_id: str
+    # Optional; defaults to None when no image value is supplied.
+    image: Optional[str] = None
+    type: str
+
+
+class ArticleChunkModel(DataModel):
+    """Pydantic model for a single chunk of an article plus article metadata."""
+
+    # ``entry_id`` and ``type`` restate the fields already declared on DataModel.
+    entry_id: str
+    platform: str
+    link: str
+    chunk_id: str
+    chunk_content: str
+    author_id: str
+    type: str
+
+
+class RepositoryChunkModel(DataModel):
+    """Pydantic model for a single chunk of a repository plus its metadata."""
+
+    # ``entry_id`` and ``type`` restate the fields already declared on DataModel.
+    entry_id: str
+    name: str
+    link: str
+    chunk_id: str
+    chunk_content: str
+    # Repositories carry ``owner_id`` where posts and articles use ``author_id``.
+    owner_id: str
+    type: str
@@ -0,0 +1,63 @@
+from typing import Optional, Tuple
+
+from models.base import VectorDBDataModel
+
+
+class PostCleanedModel(VectorDBDataModel):
+    """Cleaned post record that can be turned into a vector-DB payload."""
+
+    entry_id: str
+    platform: str
+    cleaned_content: str
+    author_id: str
+    image: Optional[str] = None
+    type: str
+
+    def to_payload(self) -> Tuple[str, dict]:
+        """Return ``(entry_id, payload)``.
+
+        The payload dict holds every field except ``entry_id``.
+        """
+        payload = dict(
+            platform=self.platform,
+            author_id=self.author_id,
+            cleaned_content=self.cleaned_content,
+            image=self.image,
+            type=self.type,
+        )
+        return (self.entry_id, payload)
+
+
+class ArticleCleanedModel(VectorDBDataModel):
+    """Cleaned article record that can be turned into a vector-DB payload."""
+
+    entry_id: str
+    platform: str
+    link: str
+    cleaned_content: str
+    author_id: str
+    type: str
+
+    def to_payload(self) -> Tuple[str, dict]:
+        """Return ``(entry_id, payload)``.
+
+        The payload dict holds every field except ``entry_id``.
+        """
+        # Tuple order fixes the key order of the resulting dict.
+        payload_fields = ("platform", "link", "cleaned_content", "author_id", "type")
+        payload = {field: getattr(self, field) for field in payload_fields}
+        return self.entry_id, payload
+
+
+class RepositoryCleanedModel(VectorDBDataModel):
+    """Cleaned repository record that can be turned into a vector-DB payload."""
+
+    entry_id: str
+    name: str
+    link: str
+    cleaned_content: str
+    owner_id: str
+    type: str
+
+    def to_payload(self) -> Tuple[str, dict]:
+        """Return ``(entry_id, payload)``.
+
+        The payload dict holds every field except ``entry_id``.
+        """
+        return self.entry_id, {
+            "name": self.name,
+            "link": self.link,
+            "cleaned_content": self.cleaned_content,
+            "owner_id": self.owner_id,
+            "type": self.type,
+        }
@@ -0,0 +1,81 @@
+from typing import Tuple
+
+import numpy as np
+
+from models.base import VectorDBDataModel
+
+
+class PostEmbeddedChunkModel(VectorDBDataModel):
+    """Embedded chunk of a post, convertible to a vector-DB payload."""
+
+    entry_id: str
+    platform: str
+    chunk_id: str
+    chunk_content: str
+    embedded_content: np.ndarray
+    author_id: str
+    type: str
+
+    class Config:
+        # Lets pydantic accept the ``np.ndarray`` field above.
+        arbitrary_types_allowed = True
+
+    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
+        """Return ``(chunk_id, embedded_content, payload)``.
+
+        The payload exposes the author under both ``"owner_id"`` and
+        ``"author_id"``: the former is the key this model has always written,
+        the latter matches the key used by the article payload and by the
+        model's own field name.
+        """
+        data = {
+            "id": self.entry_id,
+            "platform": self.platform,
+            "content": self.chunk_content,
+            # Historical key, kept so existing readers keep working.
+            "owner_id": self.author_id,
+            # Same value under the key the other author-based payloads use.
+            "author_id": self.author_id,
+            "type": self.type,
+        }
+
+        return self.chunk_id, self.embedded_content, data
+
+
+class ArticleEmbeddedChunkModel(VectorDBDataModel):
+    """Embedded chunk of an article, convertible to a vector-DB payload."""
+
+    entry_id: str
+    platform: str
+    link: str
+    chunk_id: str
+    chunk_content: str
+    embedded_content: np.ndarray
+    author_id: str
+    type: str
+
+    class Config:
+        # Lets pydantic accept the ``np.ndarray`` field above.
+        arbitrary_types_allowed = True
+
+    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
+        """Return ``(chunk_id, embedded_content, payload)``.
+
+        In the payload, ``entry_id`` is stored as ``"id"`` and
+        ``chunk_content`` as ``"content"``.
+        """
+        payload = dict(
+            id=self.entry_id,
+            platform=self.platform,
+            content=self.chunk_content,
+            link=self.link,
+            author_id=self.author_id,
+            type=self.type,
+        )
+        return (self.chunk_id, self.embedded_content, payload)
+
+
+class RepositoryEmbeddedChunkModel(VectorDBDataModel):
+    """Embedded chunk of a repository, convertible to a vector-DB payload."""
+
+    entry_id: str
+    name: str
+    link: str
+    chunk_id: str
+    chunk_content: str
+    embedded_content: np.ndarray
+    owner_id: str
+    type: str
+
+    class Config:
+        # Lets pydantic accept the ``np.ndarray`` field above.
+        arbitrary_types_allowed = True
+
+    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
+        """Return ``(chunk_id, embedded_content, payload)``.
+
+        In the payload, ``entry_id`` is stored as ``"id"`` and
+        ``chunk_content`` as ``"content"``.
+        """
+        # Pair order fixes the key order of the resulting dict.
+        pairs = (
+            ("id", self.entry_id),
+            ("name", self.name),
+            ("content", self.chunk_content),
+            ("link", self.link),
+            ("owner_id", self.owner_id),
+            ("type", self.type),
+        )
+        return self.chunk_id, self.embedded_content, dict(pairs)
@@ -0,0 +1,24 @@
+from typing import Optional
+
+from models.base import DataModel
+
+
+class RepositoryRawModel(DataModel):
+    """Raw repository record; adds these fields to the DataModel base fields."""
+
+    name: str
+    link: str
+    # Content is carried as a dict; its keys are not constrained here.
+    content: dict
+    owner_id: str
+
+
+class ArticleRawModel(DataModel):
+    """Raw article record; adds these fields to the DataModel base fields."""
+
+    platform: str
+    link: str
+    # Content is carried as a dict; its keys are not constrained here.
+    content: dict
+    author_id: str
+
+
+class PostsRawModel(DataModel):
+    """Raw post record; adds these fields to the DataModel base fields."""
+
+    platform: str
+    # Content is carried as a dict; its keys are not constrained here.
+    content: dict
+    # ``Optional[str]`` instead of ``str | None``: same meaning, but it does not
+    # require Python 3.10+ when the annotation is evaluated, and it matches
+    # the ``image`` field below.
+    author_id: Optional[str] = None
+    image: Optional[str] = None
@@ -0,0 +1,27 @@
+from langchain_openai import ChatOpenAI
+
+from llm.chain import GeneralChain
+from llm.prompt_templates import QueryExpansionTemplate
+from config import settings
+
+
+class QueryExpansion:
+    """Generates alternative phrasings of a query through an LLM chain."""
+
+    @staticmethod
+    def generate_response(query: str, to_expand_to_n: int) -> list[str]:
+        """Ask the model for reworded versions of ``query``.
+
+        Args:
+            query: The original user question.
+            to_expand_to_n: Number of versions requested in the prompt.
+
+        Returns:
+            The non-empty, whitespace-stripped pieces of the model output,
+            split on the template's separator.
+        """
+        template_builder = QueryExpansionTemplate()
+        expansion_prompt = template_builder.create_template(to_expand_to_n)
+        llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
+
+        expansion_chain = GeneralChain().get_chain(
+            llm=llm,
+            output_key="expanded_queries",
+            template=expansion_prompt,
+        )
+
+        raw_output = expansion_chain.invoke({"question": query})["expanded_queries"]
+
+        expanded = []
+        for candidate in raw_output.strip().split(template_builder.separator):
+            cleaned = candidate.strip()
+            # Drop pieces that are empty once surrounding whitespace is gone.
+            if cleaned:
+                expanded.append(cleaned)
+
+        return expanded
@@ -0,0 +1,35 @@
+from langchain_openai import ChatOpenAI
+
+from llm.chain import GeneralChain
+from llm.prompt_templates import RerankingTemplate
+from config import settings
+
+
+class Reranker:
+    """Reorders retrieved passages by relevance using an LLM chain."""
+
+    @staticmethod
+    def generate_response(
+        query: str, passages: list[str], keep_top_k: int
+    ) -> list[str]:
+        """Rerank ``passages`` against ``query`` and keep the best ones.
+
+        Args:
+            query: The question the passages are ranked against.
+            passages: Candidate passages; blank entries are ignored.
+            keep_top_k: Maximum number of passages to return.
+
+        Returns:
+            At most ``keep_top_k`` non-empty, stripped passages in the order
+            given by the model. Returns an empty list, without calling the
+            model, when there is nothing to rank or ``keep_top_k`` is not
+            positive.
+        """
+        candidate_passages = [
+            stripped_item for item in passages if (stripped_item := item.strip())
+        ]
+        # Nothing to rank (or nothing may be kept): skip the LLM round-trip.
+        # The guard also keeps the final slice from misbehaving on a negative k.
+        if not candidate_passages or keep_top_k <= 0:
+            return []
+
+        reranking_template = RerankingTemplate()
+        prompt_template = reranking_template.create_template(keep_top_k=keep_top_k)
+
+        # temperature=0 mirrors QueryExpansion so the ranking is repeatable.
+        model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
+        chain = GeneralChain.get_chain(
+            llm=model, output_key="rerank", template=prompt_template
+        )
+
+        joined_passages = reranking_template.separator.join(candidate_passages)
+        response = chain.invoke({"question": query, "passages": joined_passages})
+
+        # Split on the bare marker rather than the newline-wrapped separator:
+        # every piece is stripped afterwards, so the result is the same when
+        # the model echoes the separator exactly, and the split still works if
+        # it alters the surrounding whitespace.
+        marker = reranking_template.separator.strip()
+        reranked_passages = [
+            stripped_item
+            for item in response["rerank"].split(marker)
+            if (stripped_item := item.strip())
+        ]
+
+        # The prompt only *asks* for at most keep_top_k passages; enforce it.
+        return reranked_passages[:keep_top_k]