
Commit c4ee884

further improvements
Parent commit: 920e868

18 files changed: +61 additions, -56 deletions

ch05/recommendation/collaborative_filtering/recommender.py (+6, -6)

@@ -9,7 +9,7 @@
 from util.sparse_vector import cosine_similarity


-class BaseRecomander(object):
+class BaseRecommender(object):
     label = None
     property = None
     sparse_vector_query = None
@@ -109,7 +109,7 @@ def get_score(self, user_id: str, item_id: str) -> float:
         return result[0]


-class UserRecomander(BaseRecomander):
+class UserRecommender(BaseRecommender):
     label = "User"
     property = "userId"
     sparse_vector_query = """
@@ -127,7 +127,7 @@ class UserRecomander(BaseRecomander):
     """


-class ItemRecomander(BaseRecomander):
+class ItemRecommender(BaseRecommender):
     label = "User"
     property = "userId"
     sparse_vector_query = """
@@ -149,9 +149,9 @@ class KNNType(Enum):

     def __init__(self, uri: str, user: str, password: str):
         self._driver = GraphDatabase.driver(uri, auth=(user, password), encrypted=0)
-        self.strategies: Dict[Recommender.KNNType, BaseRecomander] = {
-            Recommender.KNNType.USER: UserRecomander(uri, user, password),
-            Recommender.KNNType.ITEM: ItemRecomander(uri, user, password)
+        self.strategies: Dict[Recommender.KNNType, BaseRecommender] = {
+            Recommender.KNNType.USER: UserRecommender(uri, user, password),
+            Recommender.KNNType.ITEM: ItemRecommender(uri, user, password)
         }

     def compute_and_store_KNN(self, type_: KNNType) -> None:
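Note: as a quick sanity check on the renamed classes, a minimal usage sketch could look like the following. The import path, URI and credentials are placeholders; only the constructor and compute_and_store_KNN signatures are taken from the diff above.

# Hypothetical driver script for the renamed Recommender facade.
from recommender import Recommender  # assumed module path

recommender = Recommender("bolt://localhost:7687", "neo4j", "password")
# Compute and persist the k-nearest-neighbour similarities with the user-based strategy...
recommender.compute_and_store_KNN(Recommender.KNNType.USER)
# ...and with the item-based strategy.
recommender.compute_and_store_KNN(Recommender.KNNType.ITEM)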

ch06/imports/yoochoose/import_yoochoose.py (+2, -18)

@@ -39,15 +39,8 @@ def import_session_data(self, file):
             MERGE (item:Item {itemId: $itemId, category: $category})
             CREATE (click:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(click)
-            CREATE (click)-[:RELATED_TO]->(item)
+            CREATE (click)-[:IS_RELATED_TO]->(item)
         """
-        # CREATE (session)-[:LAST_CLICK]->(click)
-
-        # WITH click, session
-        # MATCH (session)-[r:LAST_CLICK]->(lastClick:Click)
-        # WHERE id(click) <> id(lastClick)
-        # CREATE (lastClick)-[:NEXT]->(click)
-        # DELETE r

         for row in df.itertuples():
             try:
@@ -90,14 +83,8 @@ def import_buys_data(self, file):
             MATCH (item:Item {itemId: $itemId})
             CREATE (buy:Buy:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(buy)
-            CREATE (buy)-[:RELATED_TO]->(item)
+            CREATE (buy)-[:IS_RELATED_TO]->(item)
         """
-        # CREATE (session)-[:LAST_CLICK]->(buy)
-        # WITH buy, session
-        # MATCH (session)-[r:LAST_CLICK]->(lastClick:Click)
-        # WHERE id(buy) <> id(lastClick)
-        # CREATE (lastClick)-[:NEXT]->(buy)
-        # DELETE r

         for row in df.itertuples():
             try:
@@ -117,9 +104,6 @@ def import_buys_data(self, file):
                     tx.commit()
                     print(j, "lines processed")
         print(j, "lines processed")
-        # tx = session.begin_transaction()
-        # query to set the last transaction
-        # tx.commit


 def strip(string): return ''.join([c if 0 < ord(c) < 128 else ' ' for c in string])

ch06/imports/yoochoose/import_yoochoose_advanced.py (+2, -2)

@@ -74,7 +74,7 @@ def import_session_data(self, file):
             UNWIND $items as entry
             MERGE (item:Item {itemId: entry.itemId, category: entry.category})
             CREATE (click:Click {timestamp: entry.timestamp})
-            CREATE (click)-[:RELATED_TO]->(item)
+            CREATE (click)-[:IS_RELATED_TO]->(item)
             CREATE (session)-[:CONTAINS]->(click)
         """
         for session_id in list(sess_clicks):
@@ -108,7 +108,7 @@ def import_buys_data(self, file, sess_clicks):
             MATCH (item:Item {itemId: $itemId})
             CREATE (buy:Buy:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(buy)
-            CREATE (buy)-[:RELATED_TO]->(item)
+            CREATE (buy)-[:IS_RELATED_TO]->(item)
         """
         for chunk in pd.read_csv(file,
                                  header=0,

ch06/recommendation/session_based_recommendation_iknn_approach.py (+2, -2)

@@ -39,7 +39,7 @@ def get_item_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE item.itemId = $itemId
             WITH session
             ORDER BY id(session)
@@ -55,7 +55,7 @@ def get_item_vectors(self):
                 i += 1
                 if i % 100 == 0:
                     print(i, "rows processed")
-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(items_VSM_sparse))
             return items_VSM_sparse

ch06/recommendation/session_based_recommendation_iknn_approach_advanced.py (+1, -1)

@@ -89,7 +89,7 @@ def get_item_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE item.itemId = $itemId
             WITH session
             order by click.timestamp desc

ch06/recommendation/session_based_recommendation_sknn_approach.py (+3, -3)

@@ -39,7 +39,7 @@ def get_session_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE session.sessionId = $sessionId
             WITH item
             ORDER BY id(item)
@@ -56,7 +56,7 @@ def get_session_vectors(self):
                 if i % 100 == 0:
                     print(i, "rows processed")

-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(sessions_VSM_sparse))
             return sessions_VSM_sparse
@@ -84,7 +84,7 @@ def store_knn(self, session_id, knn):
     def recommend_to(self, session_id, k):
         top_items = []
         query = """
-            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:RELATED_TO]->(item:Item)
+            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:IS_RELATED_TO]->(item:Item)
             WHERE target.sessionId = $sessionId
             WITH DISTINCT item.itemId as itemId, r
             RETURN itemId, sum(r.weight) as score

ch06/recommendation/session_based_recommendation_sknn_approach_advanced.py (+2, -2)

@@ -87,7 +87,7 @@ def get_session_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE session.sessionId = $sessionId
             WITH item
             order by click.timestamp desc
@@ -135,7 +135,7 @@ def store_knn(self, session_id, knn):
     def recommend_to(self, session_id, k):
         top_items = []
         query = """
-            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:RELATED_TO]->(item:Item)
+            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:IS_RELATED_TO]->(item:Item)
             WHERE target.sessionId = $sessionId
             WITH DISTINCT item.itemId as itemId, r
             RETURN itemId, sum(r.weight) as score

ch06/recommendation/similarity_item_example.py (+2, -1)

@@ -1,5 +1,6 @@
 from sklearn.metrics.pairwise import cosine_similarity

+#Vector representation of the items
 item3 = [0,1,0,0,1]
 item7 = [1,0,1,0,1]
 item9 = [0,1,1,1,1]
@@ -11,7 +12,7 @@
 item346 = [1,0,1,1,1]
 item562 = [1,0,0,0,0]

-print("....")
+# Compute and print relevant similarities
 print(cosine_similarity([item12], [item23]))
 print(cosine_similarity([item12], [item65]))
 print(cosine_similarity([item12], [item85]))

ch07/imports/depaulmovie/README.md (+5)

@@ -0,0 +1,5 @@
+We need a README for this piece of code: how to pass the path to the data, the username/password, where to get the data, etc.
+
+
+We need to give the link right here, so people can click it and load the data. The Reference section also doesn't contain the link, so people will need to go to .
+I've found https://github.com/JDonini/DePaulMovie-Recommender-System, but I'm not 100% sure that it's correct.

ch07/recommendations/context_aware_recommendation_approach.py (+1, -1)

@@ -74,7 +74,7 @@ def get_item_vectors(self, context):
                 i += 1
                 if i % 100 == 0:
                     print(i, "rows processed")
-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(items_VSM_sparse))
             return items_VSM_sparse

ch08/queries/simple_ring_fraud.cypher (+3, -3)

@@ -1,7 +1,7 @@
 //simple model, run it one by one
-CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS NODE KEY;
-CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS NODE KEY;
-CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS NODE KEY;
+CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS UNIQUE;
+CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS UNIQUE;
+CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS UNIQUE;

 //run it all at once
 CREATE (alenegro:User {accountId: "49295987202", username: "alenegro", email: "[email protected]", name:"Hilda J Womack", phone_number: "580-548-1149", address: "4093 Cody Ridge Road - Enid, OK"})
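Note: if you'd rather apply these uniqueness constraints from Python than paste them into the Neo4j browser one by one, a minimal sketch using the official driver could look like this. The URI and credentials are placeholders, not taken from the commit.

from neo4j import GraphDatabase

# Placeholder connection details for a local Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

constraints = [
    "CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS UNIQUE",
    "CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS UNIQUE",
    "CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS UNIQUE",
]

with driver.session() as session:
    for statement in constraints:
        # Each schema statement runs in its own auto-commit transaction,
        # which matches the "run it one by one" note in the Cypher file.
        session.run(statement)

driver.close()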

ch08/queries/simple_ring_fraud_IP.cypher (+1, -1)

@@ -1,4 +1,4 @@
-CREATE CONSTRAINT ON (s:IPIstance) ASSERT (s.id) IS NODE KEY;
+CREATE CONSTRAINT ON (s:IPIstance) ASSERT (s.id) IS UNIQUE;


 //run it all at once

ch09/analysis/distance_based_analysis.py (+14, -13)

@@ -13,30 +13,28 @@ def __init__(self, uri, user, password):
     def close(self):
         self._driver.close()

-    def compute_and_store_distances(self, k, exact):
+    def compute_and_store_distances(self, k, exact, distance_function, relationship_name):
         start = time.time()
         data, data_labels = self.get_transaction_vectors()
         print("Time to get vectors:", time.time() - start)
         start = time.time()
-        #selected_feature = np.loadtxt("array.txt")
-        #new_data = [np.multiply(vector, selected_feature).tolist() for vector in data]
+
         if exact:
-            ann_labels, ann_distances = self.compute_knn(data, data_labels, k)
-            label = "DISTANT_FROM_EXACT"
+            ann_labels, ann_distances = self.compute_knn(data, data_labels, k, distance_function)
         else:
-            ann_labels, ann_distances = self.compute_ann(data, data_labels, k)
-            label = "DISTANT_FROM"
+            ann_labels, ann_distances = self.compute_ann(data, data_labels, k, distance_function)
+
         print("Time to compute nearest neighbors:", time.time() - start)
         start = time.time()
-        self.store_ann(data_labels, ann_labels, ann_distances, label)
+        self.store_ann(data_labels, ann_labels, ann_distances, relationship_name)
         print("Time to store nearest neighbors:", time.time() - start)
         print("done")

-    def compute_ann(self, data, data_labels, k):
+    def compute_ann(self, data, data_labels, k, distance_function):
         dim = len(data[0])
         num_elements = len(data_labels)
         # Declaring index
-        p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+        p = hnswlib.Index(space=distance_function, dim=dim)  # possible options for distance_function are l2, cosine or ip
         # Initing index - the maximum number of elements should be known beforehand
         p.init_index(max_elements=num_elements, ef_construction=800, M=200)
         # Element insertion (can be called several times):
@@ -47,9 +45,9 @@ def compute_ann(self, data, data_labels, k):
         labels, distances = p.knn_query(data, k = k)
         return labels, distances

-    def compute_knn(self, data, data_labels, k):
+    def compute_knn(self, data, data_labels, k, distance_function):
         pre_processed_data = [np.array(item) for item in data]
-        nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='mahalanobis', n_jobs=-1).fit(pre_processed_data)
+        nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=distance_function, n_jobs=-1).fit(pre_processed_data)
         knn_distances, knn_labels = nbrs.kneighbors(pre_processed_data)
         distances = knn_distances
         labels = [[data_labels[element] for element in item] for item in knn_labels]
@@ -114,8 +112,11 @@ def store_ann(self, data_labels, ann_labels, ann_distances, label): #ADD the opp

 if __name__ == '__main__':
     uri = "bolt://localhost:7687"
+    distance_formula_value = "l2"  # 'mahalanobis' for exact
+    #relationship_name_value = "DISTANT_FROM_EXACT"
+    relationship_name_value = "DISTANT_FROM"
     analyzer = DistanceBasedAnalysis(uri=uri, user="neo4j", password="q1")
-    analyzer.compute_and_store_distances(25, False);
+    analyzer.compute_and_store_distances(25, False, distance_formula_value, relationship_name_value);
     # Uncomment this line to calculate exact NNs, but it will take a lot of time!
     # analyzer.compute_and_store_distances(25, True);
     analyzer.close()
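Note: with the new signature the caller now chooses both the distance function and the relationship type that used to be hard-coded inside compute_and_store_distances. A minimal sketch of the two configurations implied by this diff, reusing the same placeholder connection details as the __main__ block above:

analyzer = DistanceBasedAnalysis(uri="bolt://localhost:7687", user="neo4j", password="q1")

# Approximate nearest neighbours via hnswlib; the space can be 'l2', 'cosine' or 'ip'.
analyzer.compute_and_store_distances(25, False, "l2", "DISTANT_FROM")

# Exact nearest neighbours via scikit-learn (much slower); any scikit-learn metric works here, e.g. 'mahalanobis'.
analyzer.compute_and_store_distances(25, True, "mahalanobis", "DISTANT_FROM_EXACT")

analyzer.close()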

ch10/analysis/DegreeAnalysis.xlsx (811 KB)

Binary file not shown.

ch12/06_spacy_entity_relationship_extraction.py (+1, -1)

@@ -49,7 +49,7 @@ def tokenize_and_store(self, text, text_id, storeTag):
         self.__text_processor.build_entities_inferred_graph(text_id)
         rules = [
             {
-                'type': 'RECEIVE_PRICE',
+                'type': 'RECEIVE_PRIZE',
                 'verbs': ['receive'],
                 'subjectTypes': ['PERSON', 'NP'],
                 'objectTypes': ['WORK_OF_ART']

ch12/requirements.txt (+2)

@@ -2,4 +2,6 @@ spacy==2.3.2
 neo4j>=4.0,<4.2
 pytextrank
 cython
+pandas
+#neuralcoref
 git+https://github.com/huggingface/neuralcoref.git

ch12/text_processors.py (+2, -2)

@@ -216,9 +216,9 @@ def extract_relationships(self, document_id, rules):
             MATCH (verb:TagOccurrence {pos: "VBD"})
             WHERE verb.lemma IN rule.verbs
             WITH verb, rule
-            MATCH (verb)-[:IS_DEPENDENT {type:"nsubj"}]->(subject)-[:PARTICIPATE_IN]->(subjectNe:NamedEntity)
+            MATCH (verb)-[:IS_DEPENDENT {type:"nsubj"}]->(subject)-[:PARTICIPATES_IN]->(subjectNe:NamedEntity)
             WHERE subjectNe.type IN rule.subjectTypes
-            MATCH (verb)-[:IS_DEPENDENT {type:"dobj"}]->(object)-[:PARTICIPATE_IN]->(objectNe:NamedEntity {type: "WORK_OF_ART"})
+            MATCH (verb)-[:IS_DEPENDENT {type:"dobj"}]->(object)-[:PARTICIPATES_IN]->(objectNe:NamedEntity {type: "WORK_OF_ART"})
             WHERE objectNe.type IN rule.objectTypes
             WITH verb, subjectNe, objectNe, rule
             MERGE (subjectNe)-[:IS_RELATED_TO {root: verb.lemma, type: rule.type}]->(objectNe)

util/sparse_vector.py (+12)

@@ -1,5 +1,11 @@
 import math

+def convert_sparse_vector(numbers):
+    vector_dict = {}
+    for k, c in enumerate(numbers):
+        if c:
+            vector_dict[k] = c
+    return vector_dict

 def cosine_similarity(vectA, vectB):
     a = dot(vectA, vectB);
@@ -39,3 +45,9 @@ def dot(vect_a, vect_b):

 def norm(vect):
     return math.sqrt(dot(vect, vect))
+
+
+if __name__ == '__main__':
+    print(convert_sparse_vector([1,0,0,1,0,0]))  # {0: 1, 3: 1}
+    print(convert_sparse_vector([1, 1, 0, 0, 0, 0]))  # {0: 1, 1: 1}
+    print(convert_sparse_vector([1, 1, 0, 0, 0, 1]))  # {0: 1, 1: 1, 5: 1}
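Note: convert_sparse_vector produces the {index: value} dictionaries that the rest of this module works with. A small end-to-end sketch, on the assumption that dot and cosine_similarity in this file operate on those dictionaries (the vectors below are made-up examples, not taken from the commit):

from util.sparse_vector import convert_sparse_vector, cosine_similarity

# Turn dense 0/1 vectors into {index: value} sparse dictionaries...
item_a = convert_sparse_vector([1, 0, 0, 1, 0, 0])  # {0: 1, 3: 1}
item_b = convert_sparse_vector([1, 1, 0, 0, 0, 1])  # {0: 1, 1: 1, 5: 1}

# ...and compare them with the dictionary-based cosine similarity.
# One shared index and norms of sqrt(2) and sqrt(3) would give roughly 0.41,
# assuming dot multiplies the values stored at shared indices.
print(cosine_similarity(item_a, item_b))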