-
Notifications
You must be signed in to change notification settings - Fork 74
/
Copy pathexample.py
53 lines (43 loc) · 1.84 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# good resources
# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1
from pgvector.psycopg import register_vector, SparseVector
import psycopg
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Open the example database. autocommit=True is passed so that the statements
# below are not left inside an open transaction waiting for a commit.
conn = psycopg.connect(dbname='pgvector_example', autocommit=True)
# Make sure the pgvector extension exists before registering its types on
# this connection (registration presumably looks the types up in the
# database — keep this order).
conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)
# Start from a clean table on every run of the example.
conn.execute('DROP TABLE IF EXISTS documents')
# NOTE(review): 30522 presumably equals the model's vocabulary size, i.e. the
# width of the vectors returned by embed() — confirm if the model is changed.
conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))')
# Sparse encoder checkpoint; the model and tokenizer are loaded from the same id.
model_id = 'opensearch-project/opensearch-neural-sparse-encoding-v1'
model = AutoModelForMaskedLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Vocabulary ids of the tokenizer's special tokens; embed() zeroes these
# columns so special tokens never contribute to a match.
special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()]
def embed(input):
    """Encode a batch of texts into sparse term-weight vectors.

    Relies on the module-level ``tokenizer``, ``model`` and
    ``special_token_ids``.

    Args:
        input: The texts to encode, passed straight to the tokenizer
            (callers in this file pass a list of strings).

    Returns:
        A 2-D NumPy array with one row per input text. Each column is a
        non-negative weight; columns listed in ``special_token_ids`` are
        always zero. The column count is presumably the model's vocabulary
        size — confirm against the table's ``sparsevec`` dimension.
    """
    feature = tokenizer(
        input,
        padding=True,
        truncation=True,
        return_tensors='pt',
        return_token_type_ids=False,
    )

    # Pure inference: disable autograd so no graph is recorded for the
    # forward pass and pooling (saves memory and time; results are unchanged).
    with torch.no_grad():
        # First element of the model output; assumed to be per-token scores
        # laid out as (batch, tokens, vocabulary) — TODO confirm.
        output = model(**feature)[0]

        # Zero the scores of padded positions, then keep the maximum score
        # over dim 1 for every remaining column.
        mask = feature['attention_mask'].unsqueeze(-1)
        values, _ = torch.max(output * mask, dim=1)

        # Clamp negatives to zero and compress with log(1 + x).
        values = torch.log(1 + torch.relu(values))

        # Special tokens carry no content, so drop their weights.
        values[:, special_token_ids] = 0

    return values.detach().cpu().numpy()
# note: works much better with longer content
input = ['The dog is barking', 'The cat is purring', 'The bear is growling']

# Encode every document in a single batch, then store each text next to
# its sparse vector.
embeddings = embed(input)
insert_sql = 'INSERT INTO documents (content, embedding) VALUES (%s, %s)'
for content, embedding in zip(input, embeddings):
    params = (content, SparseVector(embedding))
    conn.execute(insert_sql, params)

# Encode the query the same way (a batch of one, hence the [0]).
query = 'forest'
query_embedding = embed([query])[0]

# Rank documents with pgvector's <#> operator and keep at most five.
search_sql = 'SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5'
cursor = conn.execute(search_sql, (SparseVector(query_embedding),))
result = cursor.fetchall()
for row in result:
    print(row[0])