
Commit c4ee884

further improvements
Parent commit: 920e868

18 files changed: +61 additions, -56 deletions

ch05/recommendation/collaborative_filtering/recommender.py (+6, -6)

@@ -9,7 +9,7 @@
 from util.sparse_vector import cosine_similarity


-class BaseRecomander(object):
+class BaseRecommender(object):
     label = None
     property = None
     sparse_vector_query = None
@@ -109,7 +109,7 @@ def get_score(self, user_id: str, item_id: str) -> float:
         return result[0]


-class UserRecomander(BaseRecomander):
+class UserRecommender(BaseRecommender):
     label = "User"
     property = "userId"
     sparse_vector_query = """
@@ -127,7 +127,7 @@ class UserRecomander(BaseRecomander):
     """


-class ItemRecomander(BaseRecomander):
+class ItemRecommender(BaseRecommender):
     label = "User"
     property = "userId"
     sparse_vector_query = """
@@ -149,9 +149,9 @@ class KNNType(Enum):

     def __init__(self, uri: str, user: str, password: str):
         self._driver = GraphDatabase.driver(uri, auth=(user, password), encrypted=0)
-        self.strategies: Dict[Recommender.KNNType, BaseRecomander] = {
-            Recommender.KNNType.USER: UserRecomander(uri, user, password),
-            Recommender.KNNType.ITEM: ItemRecomander(uri, user, password)
+        self.strategies: Dict[Recommender.KNNType, BaseRecommender] = {
+            Recommender.KNNType.USER: UserRecommender(uri, user, password),
+            Recommender.KNNType.ITEM: ItemRecommender(uri, user, password)
         }

     def compute_and_store_KNN(self, type_: KNNType) -> None:
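Note: as a quick sanity check on the renamed classes, a minimal usage sketch could look like the following. The import path, URI and credentials are placeholders; only the constructor and compute_and_store_KNN signatures are taken from the diff above.

# Hypothetical driver script for the renamed Recommender facade.
from recommender import Recommender  # assumed module path

recommender = Recommender("bolt://localhost:7687", "neo4j", "password")
# Compute and persist the k-nearest-neighbour similarities with the user-based strategy...
recommender.compute_and_store_KNN(Recommender.KNNType.USER)
# ...and with the item-based strategy.
recommender.compute_and_store_KNN(Recommender.KNNType.ITEM)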

ch06/imports/yoochoose/import_yoochoose.py (+2, -18)

@@ -39,15 +39,8 @@ def import_session_data(self, file):
             MERGE (item:Item {itemId: $itemId, category: $category})
             CREATE (click:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(click)
-            CREATE (click)-[:RELATED_TO]->(item)
+            CREATE (click)-[:IS_RELATED_TO]->(item)
         """
-        # CREATE (session)-[:LAST_CLICK]->(click)
-
-        # WITH click, session
-        # MATCH (session)-[r:LAST_CLICK]->(lastClick:Click)
-        # WHERE id(click) <> id(lastClick)
-        # CREATE (lastClick)-[:NEXT]->(click)
-        # DELETE r

         for row in df.itertuples():
             try:
@@ -90,14 +83,8 @@ def import_buys_data(self, file):
             MATCH (item:Item {itemId: $itemId})
             CREATE (buy:Buy:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(buy)
-            CREATE (buy)-[:RELATED_TO]->(item)
+            CREATE (buy)-[:IS_RELATED_TO]->(item)
         """
-        # CREATE (session)-[:LAST_CLICK]->(buy)
-        # WITH buy, session
-        # MATCH (session)-[r:LAST_CLICK]->(lastClick:Click)
-        # WHERE id(buy) <> id(lastClick)
-        # CREATE (lastClick)-[:NEXT]->(buy)
-        # DELETE r

         for row in df.itertuples():
             try:
@@ -117,9 +104,6 @@ def import_buys_data(self, file):
                     tx.commit()
                     print(j, "lines processed")
         print(j, "lines processed")
-        # tx = session.begin_transaction()
-        # query to set the last transaction
-        # tx.commit


 def strip(string): return ''.join([c if 0 < ord(c) < 128 else ' ' for c in string])

ch06/imports/yoochoose/import_yoochoose_advanced.py (+2, -2)

@@ -74,7 +74,7 @@ def import_session_data(self, file):
             UNWIND $items as entry
             MERGE (item:Item {itemId: entry.itemId, category: entry.category})
             CREATE (click:Click {timestamp: entry.timestamp})
-            CREATE (click)-[:RELATED_TO]->(item)
+            CREATE (click)-[:IS_RELATED_TO]->(item)
             CREATE (session)-[:CONTAINS]->(click)
         """
         for session_id in list(sess_clicks):
@@ -108,7 +108,7 @@ def import_buys_data(self, file, sess_clicks):
             MATCH (item:Item {itemId: $itemId})
             CREATE (buy:Buy:Click {timestamp: $timestamp})
             CREATE (session)-[:CONTAINS]->(buy)
-            CREATE (buy)-[:RELATED_TO]->(item)
+            CREATE (buy)-[:IS_RELATED_TO]->(item)
         """
         for chunk in pd.read_csv(file,
                                  header=0,

ch06/recommendation/session_based_recommendation_iknn_approach.py (+2, -2)

@@ -39,7 +39,7 @@ def get_item_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE item.itemId = $itemId
             WITH session
             ORDER BY id(session)
@@ -55,7 +55,7 @@ def get_item_vectors(self):
                 i += 1
                 if i % 100 == 0:
                     print(i, "rows processed")
-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(items_VSM_sparse))
             return items_VSM_sparse

ch06/recommendation/session_based_recommendation_iknn_approach_advanced.py (+1, -1)

@@ -89,7 +89,7 @@ def get_item_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE item.itemId = $itemId
             WITH session
             order by click.timestamp desc

ch06/recommendation/session_based_recommendation_sknn_approach.py (+3, -3)

@@ -39,7 +39,7 @@ def get_session_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE session.sessionId = $sessionId
             WITH item
             ORDER BY id(item)
@@ -56,7 +56,7 @@ def get_session_vectors(self):
                 if i % 100 == 0:
                     print(i, "rows processed")

-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(sessions_VSM_sparse))
             return sessions_VSM_sparse
@@ -84,7 +84,7 @@ def store_knn(self, session_id, knn):
     def recommend_to(self, session_id, k):
         top_items = []
         query = """
-            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:RELATED_TO]->(item:Item)
+            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:IS_RELATED_TO]->(item:Item)
             WHERE target.sessionId = $sessionId
             WITH DISTINCT item.itemId as itemId, r
             RETURN itemId, sum(r.weight) as score

ch06/recommendation/session_based_recommendation_sknn_approach_advanced.py (+2, -2)

@@ -87,7 +87,7 @@ def get_session_vectors(self):
         """

         query = """
-            MATCH (item:Item)<-[:RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
+            MATCH (item:Item)<-[:IS_RELATED_TO]-(click:Click)<-[:CONTAINS]-(session:Session)
             WHERE session.sessionId = $sessionId
             WITH item
             order by click.timestamp desc
@@ -135,7 +135,7 @@ def store_knn(self, session_id, knn):
     def recommend_to(self, session_id, k):
         top_items = []
         query = """
-            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:RELATED_TO]->(item:Item)
+            MATCH (target:Session)-[r:SIMILAR_TO]->(d:Session)-[:CONTAINS]->(:Click)-[:IS_RELATED_TO]->(item:Item)
             WHERE target.sessionId = $sessionId
             WITH DISTINCT item.itemId as itemId, r
             RETURN itemId, sum(r.weight) as score

ch06/recommendation/similarity_item_example.py (+2, -1)

@@ -1,5 +1,6 @@
 from sklearn.metrics.pairwise import cosine_similarity

+#Vector representation of the items
 item3 = [0,1,0,0,1]
 item7 = [1,0,1,0,1]
 item9 = [0,1,1,1,1]
@@ -11,7 +12,7 @@
 item346 = [1,0,1,1,1]
 item562 = [1,0,0,0,0]

-print("....")
+# Compute and print relevant similarities
 print(cosine_similarity([item12], [item23]))
 print(cosine_similarity([item12], [item65]))
 print(cosine_similarity([item12], [item85]))

ch07/imports/depaulmovie/README.md (+5)

@@ -0,0 +1,5 @@
+We need a README for this piece of code: how to pass the path to the data, the username/password, where to get the data, etc.
+
+
+We need to give the link right here, so people can click it and load the data. The Reference section also doesn't contain the link, so people will need to go to .
+I've found https://github.com/JDonini/DePaulMovie-Recommender-System, but I'm not 100% sure that it's correct.

ch07/recommendations/context_aware_recommendation_approach.py (+1, -1)

@@ -74,7 +74,7 @@ def get_item_vectors(self, context):
                 i += 1
                 if i % 100 == 0:
                     print(i, "rows processed")
-            print(i, "lines processed")
+            print(i, "rows processed")
             print(len(items_VSM_sparse))
             return items_VSM_sparse

ch08/queries/simple_ring_fraud.cypher (+3, -3)

@@ -1,7 +1,7 @@
 //simple model, run it one by one
-CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS NODE KEY;
-CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS NODE KEY;
-CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS NODE KEY;
+CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS UNIQUE;
+CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS UNIQUE;
+CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS UNIQUE;

 //run it all at once
 CREATE (alenegro:User {accountId: "49295987202", username: "alenegro", email: "[email protected]", name:"Hilda J Womack", phone_number: "580-548-1149", address: "4093 Cody Ridge Road - Enid, OK"})
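Note: if you'd rather apply these uniqueness constraints from Python than paste them into the Neo4j browser one by one, a minimal sketch using the official driver could look like this. The URI and credentials are placeholders, not taken from the commit.

from neo4j import GraphDatabase

# Placeholder connection details for a local Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

constraints = [
    "CREATE CONSTRAINT ON (s:Email) ASSERT (s.value) IS UNIQUE",
    "CREATE CONSTRAINT ON (s:PhoneNumber) ASSERT (s.value) IS UNIQUE",
    "CREATE CONSTRAINT ON (s:Address) ASSERT (s.value) IS UNIQUE",
]

with driver.session() as session:
    for statement in constraints:
        # Each schema statement runs in its own auto-commit transaction,
        # which matches the "run it one by one" note in the Cypher file.
        session.run(statement)

driver.close()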

ch08/queries/simple_ring_fraud_IP.cypher (+1, -1)

@@ -1,4 +1,4 @@
-CREATE CONSTRAINT ON (s:IPIstance) ASSERT (s.id) IS NODE KEY;
+CREATE CONSTRAINT ON (s:IPIstance) ASSERT (s.id) IS UNIQUE;


 //run it all at once

ch09/analysis/distance_based_analysis.py (+14, -13)

@@ -13,30 +13,28 @@ def __init__(self, uri, user, password):
     def close(self):
         self._driver.close()

-    def compute_and_store_distances(self, k, exact):
+    def compute_and_store_distances(self, k, exact, distance_function, relationship_name):
         start = time.time()
         data, data_labels = self.get_transaction_vectors()
         print("Time to get vectors:", time.time() - start)
         start = time.time()
-        #selected_feature = np.loadtxt("array.txt")
-        #new_data = [np.multiply(vector, selected_feature).tolist() for vector in data]
+
         if exact:
-            ann_labels, ann_distances = self.compute_knn(data, data_labels, k)
-            label = "DISTANT_FROM_EXACT"
+            ann_labels, ann_distances = self.compute_knn(data, data_labels, k, distance_function)
         else:
-            ann_labels, ann_distances = self.compute_ann(data, data_labels, k)
-            label = "DISTANT_FROM"
+            ann_labels, ann_distances = self.compute_ann(data, data_labels, k, distance_function)
+
         print("Time to compute nearest neighbors:", time.time() - start)
         start = time.time()
-        self.store_ann(data_labels, ann_labels, ann_distances, label)
+        self.store_ann(data_labels, ann_labels, ann_distances, relationship_name)
         print("Time to store nearest neighbors:", time.time() - start)
         print("done")

-    def compute_ann(self, data, data_labels, k):
+    def compute_ann(self, data, data_labels, k, distance_function):
         dim = len(data[0])
         num_elements = len(data_labels)
         # Declaring index
-        p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+        p = hnswlib.Index(space=distance_function, dim=dim)  # possible options for distance_function are l2, cosine or ip
         # Initing index - the maximum number of elements should be known beforehand
         p.init_index(max_elements=num_elements, ef_construction=800, M=200)
         # Element insertion (can be called several times):
@@ -47,9 +45,9 @@ def compute_ann(self, data, data_labels, k):
         labels, distances = p.knn_query(data, k = k)
         return labels, distances

-    def compute_knn(self, data, data_labels, k):
+    def compute_knn(self, data, data_labels, k, distance_function):
         pre_processed_data = [np.array(item) for item in data]
-        nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='mahalanobis', n_jobs=-1).fit(pre_processed_data)
+        nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=distance_function, n_jobs=-1).fit(pre_processed_data)
         knn_distances, knn_labels = nbrs.kneighbors(pre_processed_data)
         distances = knn_distances
         labels = [[data_labels[element] for element in item] for item in knn_labels]
@@ -114,8 +112,11 @@ def store_ann(self, data_labels, ann_labels, ann_distances, label): #ADD the opp

 if __name__ == '__main__':
     uri = "bolt://localhost:7687"
+    distance_formula_value = "l2"  # 'mahalanobis' for exact
+    #relationship_name_value = "DISTANT_FROM_EXACT"
+    relationship_name_value = "DISTANT_FROM"
     analyzer = DistanceBasedAnalysis(uri=uri, user="neo4j", password="q1")
-    analyzer.compute_and_store_distances(25, False);
+    analyzer.compute_and_store_distances(25, False, distance_formula_value, relationship_name_value);
     # Uncomment this line to calculate exact NNs, but it will take a lot of time!
     # analyzer.compute_and_store_distances(25, True);
     analyzer.close()
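Note: with the new signature the caller now chooses both the distance function and the relationship type that used to be hard-coded inside compute_and_store_distances. A minimal sketch of the two configurations implied by this diff, reusing the same placeholder connection details as the __main__ block above:

analyzer = DistanceBasedAnalysis(uri="bolt://localhost:7687", user="neo4j", password="q1")

# Approximate nearest neighbours via hnswlib; the space can be 'l2', 'cosine' or 'ip'.
analyzer.compute_and_store_distances(25, False, "l2", "DISTANT_FROM")

# Exact nearest neighbours via scikit-learn (much slower); any scikit-learn metric works here, e.g. 'mahalanobis'.
analyzer.compute_and_store_distances(25, True, "mahalanobis", "DISTANT_FROM_EXACT")

analyzer.close()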

ch10/analysis/DegreeAnalysis.xlsx (811 KB)

Binary file not shown.

ch12/06_spacy_entity_relationship_extraction.py (+1, -1)

@@ -49,7 +49,7 @@ def tokenize_and_store(self, text, text_id, storeTag):
         self.__text_processor.build_entities_inferred_graph(text_id)
         rules = [
             {
-                'type': 'RECEIVE_PRICE',
+                'type': 'RECEIVE_PRIZE',
                 'verbs': ['receive'],
                 'subjectTypes': ['PERSON', 'NP'],
                 'objectTypes': ['WORK_OF_ART']

ch12/requirements.txt (+2)

@@ -2,4 +2,6 @@ spacy==2.3.2
 neo4j>=4.0,<4.2
 pytextrank
 cython
+pandas
+#neuralcoref
 git+https://github.com/huggingface/neuralcoref.git

ch12/text_processors.py (+2, -2)

@@ -216,9 +216,9 @@ def extract_relationships(self, document_id, rules):
             MATCH (verb:TagOccurrence {pos: "VBD"})
             WHERE verb.lemma IN rule.verbs
             WITH verb, rule
-            MATCH (verb)-[:IS_DEPENDENT {type:"nsubj"}]->(subject)-[:PARTICIPATE_IN]->(subjectNe:NamedEntity)
+            MATCH (verb)-[:IS_DEPENDENT {type:"nsubj"}]->(subject)-[:PARTICIPATES_IN]->(subjectNe:NamedEntity)
             WHERE subjectNe.type IN rule.subjectTypes
-            MATCH (verb)-[:IS_DEPENDENT {type:"dobj"}]->(object)-[:PARTICIPATE_IN]->(objectNe:NamedEntity {type: "WORK_OF_ART"})
+            MATCH (verb)-[:IS_DEPENDENT {type:"dobj"}]->(object)-[:PARTICIPATES_IN]->(objectNe:NamedEntity {type: "WORK_OF_ART"})
             WHERE objectNe.type IN rule.objectTypes
             WITH verb, subjectNe, objectNe, rule
             MERGE (subjectNe)-[:IS_RELATED_TO {root: verb.lemma, type: rule.type}]->(objectNe)

util/sparse_vector.py (+12)

@@ -1,5 +1,11 @@
 import math

+def convert_sparse_vector(numbers):
+    vector_dict = {}
+    for k, c in enumerate(numbers):
+        if c:
+            vector_dict[k] = c
+    return vector_dict

 def cosine_similarity(vectA, vectB):
     a = dot(vectA, vectB);
@@ -39,3 +45,9 @@ def dot(vect_a, vect_b):

 def norm(vect):
     return math.sqrt(dot(vect, vect))
+
+
+if __name__ == '__main__':
+    print(convert_sparse_vector([1,0,0,1,0,0]))  # {0: 1, 3: 1}
+    print(convert_sparse_vector([1, 1, 0, 0, 0, 0]))  # {0: 1, 1: 1}
+    print(convert_sparse_vector([1, 1, 0, 0, 0, 1]))  # {0: 1, 1: 1, 5: 1}
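Note: convert_sparse_vector produces the {index: value} dictionaries that the rest of this module works with. A small end-to-end sketch, on the assumption that dot and cosine_similarity in this file operate on those dictionaries (the vectors below are made-up examples, not taken from the commit):

from util.sparse_vector import convert_sparse_vector, cosine_similarity

# Turn dense 0/1 vectors into {index: value} sparse dictionaries...
item_a = convert_sparse_vector([1, 0, 0, 1, 0, 0])  # {0: 1, 3: 1}
item_b = convert_sparse_vector([1, 1, 0, 0, 0, 1])  # {0: 1, 1: 1, 5: 1}

# ...and compare them with the dictionary-based cosine similarity.
# One shared index and norms of sqrt(2) and sqrt(3) would give roughly 0.41,
# assuming dot multiplies the values stored at shared indices.
print(cosine_similarity(item_a, item_b))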