Skip to content

Commit 4c97bf4

Browse files
committed
feat: fix vector issues in feature pipeline
1 parent 38e3ba5 commit 4c97bf4

20 files changed

+698
-0
lines changed

Diff for: 3-feature-pipeline/llm/__init__.py

Whitespace-only changes.

Diff for: 3-feature-pipeline/llm/chain.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from langchain.chains.llm import LLMChain
2+
from langchain.prompts import PromptTemplate
3+
4+
5+
class GeneralChain:
    """Factory wrapper around LangChain's ``LLMChain``."""

    @staticmethod
    def get_chain(llm, template: PromptTemplate, output_key: str, verbose=True):
        """Build an ``LLMChain`` from the given pieces.

        Args:
            llm: Model object passed to ``LLMChain`` as ``llm``.
            template: Prompt passed to ``LLMChain`` as ``prompt``.
            output_key: Passed to ``LLMChain`` as ``output_key``.
            verbose: Forwarded to ``LLMChain`` unchanged.

        Returns:
            The newly constructed ``LLMChain``.
        """
        chain_kwargs = {
            "llm": llm,
            "prompt": template,
            "output_key": output_key,
            "verbose": verbose,
        }
        return LLMChain(**chain_kwargs)

Diff for: 3-feature-pipeline/llm/prompt_templates.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from abc import ABC, abstractmethod
2+
3+
from langchain.prompts import PromptTemplate
4+
from pydantic import BaseModel
5+
6+
7+
class BasePromptTemplate(ABC, BaseModel):
    """Common interface for the prompt builders defined in this module."""

    @abstractmethod
    def create_template(self, *args, **kwargs) -> PromptTemplate:
        """Return the ``PromptTemplate`` for this prompt.

        The abstract signature accepts arbitrary arguments because concrete
        templates take different build-time parameters (for example
        ``to_expand_to_n`` or ``keep_top_k``); a zero-argument abstract
        method would make those overrides incompatible with the base class.
        """
11+
12+
13+
class QueryExpansionTemplate(BasePromptTemplate):
    """Prompt that asks the model for several rephrasings of one question."""

    # Continuation lines start at column 0 so that no source indentation
    # ends up inside the prompt text.
    prompt: str = """You are an AI language model assistant. Your task is to generate {to_expand_to_n}
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by '{separator}'.
Original question: {question}"""

    @property
    def separator(self) -> str:
        """Marker the prompt tells the model to place between questions."""
        return "#next-question#"

    def create_template(self, to_expand_to_n: int) -> PromptTemplate:
        """Build the prompt template with everything but the question filled in.

        Args:
            to_expand_to_n: Number of question variants requested in the prompt.

        Returns:
            A ``PromptTemplate`` whose only remaining input is ``question``.
        """
        return PromptTemplate(
            template=self.prompt,
            input_variables=["question"],
            partial_variables={
                "separator": self.separator,
                "to_expand_to_n": to_expand_to_n,
            },
        )
34+
35+
36+
class SelfQueryTemplate(BasePromptTemplate):
    """Prompt that asks the model to extract a user/author id from a question."""

    # Continuation lines start at column 0 so that no source indentation
    # ends up inside the prompt text.
    prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question.
The required information that needs to be extracted is the user or author id.
Your response should consist of only the extracted id (e.g. 1345256), nothing else.
If you cannot find the author id, return the string "None".
User question: {question}"""

    def create_template(self) -> PromptTemplate:
        """Return a ``PromptTemplate`` whose only input variable is ``question``."""
        return PromptTemplate(template=self.prompt, input_variables=["question"])
45+
46+
47+
class RerankingTemplate(BasePromptTemplate):
    """Prompt that asks the model to order passages by relevance to a query."""

    # Continuation lines start at column 0 so that no source indentation
    # ends up inside the prompt text.
    prompt: str = """You are an AI language model assistant. Your task is to rerank passages related to a query
based on their relevance.
The most relevant passages should be put at the beginning.
You should only pick at max {keep_top_k} passages.
The provided and reranked documents are separated by '{separator}'.

The following are passages related to this query: {question}.

Passages:
{passages}
"""

    @property
    def separator(self) -> str:
        """Delimiter named in the prompt for both input and output passages."""
        return "\n#next-document#\n"

    def create_template(self, keep_top_k: int) -> PromptTemplate:
        """Build the prompt template with the passage limit and separator bound.

        Args:
            keep_top_k: Maximum number of passages the prompt asks for.

        Returns:
            A ``PromptTemplate`` still expecting ``question`` and ``passages``.
        """
        prefilled = {"keep_top_k": keep_top_k, "separator": self.separator}
        return PromptTemplate(
            template=self.prompt,
            input_variables=["question", "passages"],
            partial_variables=prefilled,
        )

Diff for: 3-feature-pipeline/models/__init__.py

Whitespace-only changes.

Diff for: 3-feature-pipeline/models/base.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from abc import ABC, abstractmethod
2+
3+
from pydantic import BaseModel
4+
5+
6+
class DataModel(BaseModel):
    """
    Base pydantic model shared by all data models; declares the two common fields.
    """

    # Identifier of the entry, as a string.
    entry_id: str
    # Type label of the entry, as a string.
    type: str
13+
14+
15+
class VectorDBDataModel(ABC, DataModel):
    """
    Abstract class for all data models that need to be saved into a vector DB (e.g. Qdrant)
    """

    # Declared as ``str`` to agree with ``DataModel`` and with the concrete
    # subclasses, which all declare ``entry_id: str`` and return it from
    # ``to_payload`` as a string.
    entry_id: str
    type: str

    @abstractmethod
    def to_payload(self) -> tuple:
        """Return this model as a tuple of values for the vector DB.

        The exact tuple layout is defined by each concrete subclass.
        """

Diff for: 3-feature-pipeline/models/chunk.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from typing import Optional
2+
3+
from models.base import DataModel
4+
5+
6+
class PostChunkModel(DataModel):
    """Data model for one chunk of a post; fields only, no behavior."""

    # Fields also declared on DataModel.
    entry_id: str
    type: str

    # Post attributes.
    platform: str
    author_id: str
    image: Optional[str] = None  # optional; defaults to None

    # Chunk attributes.
    chunk_id: str
    chunk_content: str
14+
15+
16+
class ArticleChunkModel(DataModel):
    """Data model for one chunk of an article; fields only, no behavior."""

    # Fields also declared on DataModel.
    entry_id: str
    type: str

    # Article attributes.
    platform: str
    link: str
    author_id: str

    # Chunk attributes.
    chunk_id: str
    chunk_content: str
24+
25+
26+
class RepositoryChunkModel(DataModel):
    """Data model for one chunk of a repository; fields only, no behavior."""

    # Fields also declared on DataModel.
    entry_id: str
    type: str

    # Repository attributes.
    name: str
    link: str
    owner_id: str

    # Chunk attributes.
    chunk_id: str
    chunk_content: str

Diff for: 3-feature-pipeline/models/clean.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from typing import Optional, Tuple
2+
3+
from models.base import VectorDBDataModel
4+
5+
6+
class PostCleanedModel(VectorDBDataModel):
    """Cleaned post that can be turned into an ``(id, payload)`` pair."""

    entry_id: str
    platform: str
    cleaned_content: str
    author_id: str
    image: Optional[str] = None
    type: str

    def to_payload(self) -> Tuple[str, dict]:
        """Return ``(entry_id, payload)``.

        The payload holds ``platform``, ``author_id``, ``cleaned_content``,
        ``image`` and ``type``, in that order.
        """
        payload = dict(
            platform=self.platform,
            author_id=self.author_id,
            cleaned_content=self.cleaned_content,
            image=self.image,
            type=self.type,
        )
        return self.entry_id, payload
24+
25+
26+
class ArticleCleanedModel(VectorDBDataModel):
    """Cleaned article that can be turned into an ``(id, payload)`` pair."""

    entry_id: str
    platform: str
    link: str
    cleaned_content: str
    author_id: str
    type: str

    def to_payload(self) -> Tuple[str, dict]:
        """Return ``(entry_id, payload)``.

        The payload copies the listed attributes under their own names,
        in the order given below.
        """
        payload_fields = ("platform", "link", "cleaned_content", "author_id", "type")
        payload = {field: getattr(self, field) for field in payload_fields}
        return self.entry_id, payload
44+
45+
46+
class RepositoryCleanedModel(VectorDBDataModel):
    """Cleaned repository that can be turned into an ``(id, payload)`` pair."""

    entry_id: str
    name: str
    link: str
    cleaned_content: str
    owner_id: str
    type: str

    def to_payload(self) -> Tuple[str, dict]:
        """Return ``(entry_id, payload)``.

        The payload holds ``name``, ``link``, ``cleaned_content``,
        ``owner_id`` and ``type``, in that order.
        """
        return self.entry_id, {
            "name": self.name,
            "link": self.link,
            "cleaned_content": self.cleaned_content,
            "owner_id": self.owner_id,
            "type": self.type,
        }

Diff for: 3-feature-pipeline/models/embedded_chunk.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from typing import Tuple
2+
3+
import numpy as np
4+
5+
from models.base import VectorDBDataModel
6+
7+
8+
class PostEmbeddedChunkModel(VectorDBDataModel):
    """Embedded post chunk that can be turned into ``(id, vector, payload)``."""

    entry_id: str
    platform: str
    chunk_id: str
    chunk_content: str
    embedded_content: np.ndarray
    author_id: str
    type: str

    class Config:
        # np.ndarray is not a type pydantic validates natively.
        arbitrary_types_allowed = True

    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
        """Return ``(chunk_id, embedded_content, payload)``.

        The payload exposes the author under ``"author_id"``, matching the
        article chunk payload and the cleaned post payload, so one filter key
        works across them. The historical ``"owner_id"`` key is kept with the
        same value so existing readers of that key keep working.
        """
        data = {
            "id": self.entry_id,
            "platform": self.platform,
            "content": self.chunk_content,
            # NOTE(review): legacy key; confirm whether any consumer still
            # reads it before removing.
            "owner_id": self.author_id,
            "author_id": self.author_id,
            "type": self.type,
        }

        return self.chunk_id, self.embedded_content, data
30+
31+
32+
class ArticleEmbeddedChunkModel(VectorDBDataModel):
    """Embedded article chunk that can be turned into ``(id, vector, payload)``."""

    entry_id: str
    platform: str
    link: str
    chunk_id: str
    chunk_content: str
    embedded_content: np.ndarray
    author_id: str
    type: str

    class Config:
        # np.ndarray is not a type pydantic validates natively.
        arbitrary_types_allowed = True

    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
        """Return ``(chunk_id, embedded_content, payload)``.

        The payload keys are ``id``, ``platform``, ``content``, ``link``,
        ``author_id`` and ``type``, in that order.
        """
        payload = dict(
            id=self.entry_id,
            platform=self.platform,
            content=self.chunk_content,
            link=self.link,
            author_id=self.author_id,
            type=self.type,
        )
        return self.chunk_id, self.embedded_content, payload
56+
57+
58+
class RepositoryEmbeddedChunkModel(VectorDBDataModel):
    """Embedded repository chunk that can be turned into ``(id, vector, payload)``."""

    entry_id: str
    name: str
    link: str
    chunk_id: str
    chunk_content: str
    embedded_content: np.ndarray
    owner_id: str
    type: str

    class Config:
        # np.ndarray is not a type pydantic validates natively.
        arbitrary_types_allowed = True

    def to_payload(self) -> Tuple[str, np.ndarray, dict]:
        """Return ``(chunk_id, embedded_content, payload)``.

        The payload keys are ``id``, ``name``, ``content``, ``link``,
        ``owner_id`` and ``type``, in that order.
        """
        return (
            self.chunk_id,
            self.embedded_content,
            {
                "id": self.entry_id,
                "name": self.name,
                "content": self.chunk_content,
                "link": self.link,
                "owner_id": self.owner_id,
                "type": self.type,
            },
        )

Diff for: 3-feature-pipeline/models/raw.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from typing import Optional
2+
3+
from models.base import DataModel
4+
5+
6+
class RepositoryRawModel(DataModel):
    """Raw repository record; adds repository fields to ``DataModel``."""

    name: str
    link: str
    # Raw content is carried as a dict; its keys are not defined here.
    content: dict
    owner_id: str
11+
12+
13+
class ArticleRawModel(DataModel):
    """Raw article record; adds article fields to ``DataModel``."""

    platform: str
    link: str
    # Raw content is carried as a dict; its keys are not defined here.
    content: dict
    author_id: str
18+
19+
20+
class PostsRawModel(DataModel):
    """Raw post record; adds post fields to ``DataModel``."""

    platform: str
    # Raw content is carried as a dict; its keys are not defined here.
    content: dict
    # Both fields below are optional and default to None.
    author_id: Optional[str] = None
    image: Optional[str] = None

Diff for: 3-feature-pipeline/rag/__init__.py

Whitespace-only changes.

Diff for: 3-feature-pipeline/rag/query_expanison.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from langchain_openai import ChatOpenAI
2+
3+
from llm.chain import GeneralChain
4+
from llm.prompt_templates import QueryExpansionTemplate
5+
from config import settings
6+
7+
8+
class QueryExpansion:
    """Generate alternative phrasings of a query with a chat model."""

    @staticmethod
    def generate_response(query: str, to_expand_to_n: int) -> list[str]:
        """Ask the model for rephrasings of ``query`` and return them as a list.

        Args:
            query: The original question, sent as the ``question`` input.
            to_expand_to_n: Number of variants requested in the prompt.

        Returns:
            The non-empty, whitespace-stripped pieces of the model output,
            split on the template's separator.
        """
        template_builder = QueryExpansionTemplate()
        expansion_prompt = template_builder.create_template(to_expand_to_n)
        llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)

        chain = GeneralChain.get_chain(
            llm=llm,
            template=expansion_prompt,
            output_key="expanded_queries",
        )
        raw_output = chain.invoke({"question": query})["expanded_queries"]

        # Split on the separator, then drop pieces that are blank after stripping.
        pieces = raw_output.strip().split(template_builder.separator)
        return [piece for piece in map(str.strip, pieces) if piece]

Diff for: 3-feature-pipeline/rag/reranking.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from langchain_openai import ChatOpenAI
2+
3+
from llm.chain import GeneralChain
4+
from llm.prompt_templates import RerankingTemplate
5+
from config import settings
6+
7+
8+
class Reranker:
    """Rerank passages for a query with a chat model."""

    @staticmethod
    def generate_response(
        query: str, passages: list[str], keep_top_k: int
    ) -> list[str]:
        """Return at most ``keep_top_k`` passages, ordered as the model ranked them.

        Args:
            query: The question the passages relate to.
            passages: Candidate passages; blank entries are ignored.
            keep_top_k: Maximum number of passages to return.

        Returns:
            The non-empty, whitespace-stripped passages parsed from the model
            output, truncated to ``keep_top_k``. Returns an empty list without
            calling the model when there is nothing to rank or when
            ``keep_top_k`` is not positive.
        """
        cleaned_passages = [text for text in map(str.strip, passages) if text]
        # Nothing to rank: skip the model call entirely.
        if not cleaned_passages or keep_top_k <= 0:
            return []

        reranking_template = RerankingTemplate()
        prompt_template = reranking_template.create_template(keep_top_k=keep_top_k)

        # temperature=0 matches QueryExpansion and keeps the ranking as
        # repeatable as the model allows.
        model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
        chain = GeneralChain.get_chain(
            llm=model, output_key="rerank", template=prompt_template
        )

        separator = reranking_template.separator
        response = chain.invoke(
            {"question": query, "passages": separator.join(cleaned_passages)}
        )

        reranked = response["rerank"].strip().split(separator)
        ranked_passages = [text for text in map(str.strip, reranked) if text]

        # The prompt only asks for a limit; enforce it on the parsed output.
        return ranked_passages[:keep_top_k]

0 commit comments

Comments
 (0)