vecorro
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎__init__.py
+1 b/‎__init__.py
+1
diff --git a/‎ch04/imports/movielens/README.md
+15-11 b/‎ch04/imports/movielens/README.md
+15-11
diff --git a/‎ch04/imports/movielens/requirements.txt
+1-1 b/‎ch04/imports/movielens/requirements.txt
+1-1
diff --git a/‎ch04/recommendation/content_based_recommendation_second_approach.py
+4-4 b/‎ch04/recommendation/content_based_recommendation_second_approach.py
+4-4
diff --git a/‎ch04/recommendation/content_based_recommendation_third_approach.py
+4-4 b/‎ch04/recommendation/content_based_recommendation_third_approach.py
+4-4
diff --git a/‎ch05/imports/retail_rocket/README.md
+1-1 b/‎ch05/imports/retail_rocket/README.md
+1-1
diff --git a/‎ch05/imports/retail_rocket/import_retail_rocket_ui.py
+8-6 b/‎ch05/imports/retail_rocket/import_retail_rocket_ui.py
+8-6
diff --git a/‎ch05/imports/retail_rocket/requirements.txt
+1-1 b/‎ch05/imports/retail_rocket/requirements.txt
+1-1
diff --git a/‎ch05/recommendation/collaborative_filtering/recommender.py
-1 b/‎ch05/recommendation/collaborative_filtering/recommender.py
-1
diff --git a/‎ch05/recommendation/requirements.txt
+1-1 b/‎ch05/recommendation/requirements.txt
+1-1
diff --git a/‎ch06/imports/yoochoose/Makefile
+6-1 b/‎ch06/imports/yoochoose/Makefile
+6-1
diff --git a/‎ch06/imports/yoochoose/README.md
+34-1 b/‎ch06/imports/yoochoose/README.md
+34-1
diff --git a/‎ch06/imports/yoochoose/import_yoochoose.py
+5-5 b/‎ch06/imports/yoochoose/import_yoochoose.py
+5-5
diff --git a/‎ch06/imports/yoochoose/import_yoochoose_advanced.py
+7-7 b/‎ch06/imports/yoochoose/import_yoochoose_advanced.py
+7-7
diff --git a/‎ch06/imports/yoochoose/requirements.txt
+1-1 b/‎ch06/imports/yoochoose/requirements.txt
+1-1
diff --git a/‎ch06/recommendation/requirements.txt
+2-1 b/‎ch06/recommendation/requirements.txt
+2-1
diff --git a/‎ch06/recommendation/session_based_recommendation_iknn_approach.py
+4-4 b/‎ch06/recommendation/session_based_recommendation_iknn_approach.py
+4-4
@@ -3,3 +3,6 @@
 /ch11/masc_word_sense_sentence_corpus.V1.0/
 /.venv/
 /.idea/
+/ch06/imports/yoochoose/dataset-README.txt
+ch06/imports/yoochoose/yoochoose-*.dat
+/ch06/imports/yoochoose/yoochoose-data.7z
@@ -0,0 +1 @@
+__all__ = ["util", "ch12"]
@@ -4,38 +4,42 @@
 The import scripts requires some specific library such as imdbpy which allows to access to ImDB (https://www.imdb.com/) api.
 To install what is necessary run:
 
-`#make`
+```sh
+make
+```
 
 ## Download the data source
 The Makefile contains also the command to download the necessary data sources.
 Run:
 
-`#make source`
+```sh
+make source
+```
 
-If you would like to download it manually the URL is:
-
-http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
+You can also download it manually from the [project's site](http://files.grouplens.org/datasets/movielens/ml-latest-small.zip).
 
 The default location is in the home of this code repository in the directory datasets
-(eventually in the movielens subdirectory). 
+(eventually in the `movielens` subdirectory). 
 
 
 ## Run the import
 
-`python import_movielens.py -u <neo4j username> -p <password>  -b <bolt uri> -s <source directory>`
+```sh
+python import_movielens.py -u <neo4j username> -p <password>  -b <bolt uri> -s <source directory>
+```
 
 If you used the makefile for downloading the directory you don't need to specify the datasource. 
 The simple version takes a while to be completed. I recommend to run the parallel version as follows:
 
-`python import_movielens_parallel.py -u <neo4j username> -p <password>  -b <bolt uri> -s <source directory>`
+```sh
+python import_movielens_parallel.py -u <neo4j username> -p <password>  -b <bolt uri> -s <source directory>
+```
 
 ## Note during the import
 
 Note that IMDB imposes some constraints for the access to its API. Due to this, if the machine is powerful enough 
 it can happen that it will start rejecting the requests. It is perfectly normal. 
 
-After the chapter has been released the full version of the IMDB has been released here:
-    
-https://www.imdb.com/interfaces/
+After this chapter was released, the full version of the IMDB data became available [here](https://www.imdb.com/interfaces/).
 
 In the future I'll make some changes in order to load from files instead.
@@ -1,3 +1,3 @@
-neo4j==4.2.0
+neo4j>=4.0,<4.3
 imdbpy==2020.9.25
 nose==1.3.7
@@ -22,7 +22,7 @@ def __init__(self, uri, user, password):
     def recommend_to(self, userId, k):
         user_VSM = self.get_user_vector(userId)
         movies_VSM, titles = self.get_movie_vectors(userId)
-        top_k = self.compute_top_k (user_VSM, movies_VSM, k);
+        top_k = self.compute_top_k (user_VSM, movies_VSM, k)
         results = []
         for movie in top_k:
             item = {}
@@ -92,8 +92,8 @@ def get_movie_vectors(self, user_id):
 
             i = 0
             for movie in tx.run(list_of_moview_query, {"userId": user_id}):
-                movie_id = movie["movieId"];
-                title = movie["title"];
+                movie_id = movie["movieId"]
+                title = movie["title"]
                 vector = tx.run(query, {"movieId": movie_id})
                 movies_VSM[movie_id] = vector.single()[0]
                 titles[movie_id] = title
@@ -115,6 +115,6 @@ def get_movie_vectors(self, user_id):
         print(__file__ , "Specify the user with -t <user id>")
         print("Setting the default to:", target_user)
     recommender = ContentBasedRecommenderSecondApproach(uri=uri, user=neo4j_user, password=neo4j_password)
-    top10 = recommender.recommend_to(target_user, 10); #Replace 598 with any other user id you are interested in
+    top10 = recommender.recommend_to(target_user, 10) #Replace 598 with any other user id you are interested in
     print(top10)
 
@@ -12,7 +12,7 @@ def compute_and_store_similarity(self):
         movies_VSM = self.get_movie_vectors()
         i = 0
         for movie in movies_VSM:
-            knn = self.compute_knn(movie, movies_VSM.copy(), 10);
+            knn = self.compute_knn(movie, movies_VSM.copy(), 10)
             self.store_knn(movie, knn)
             # would be useful to add a progress bar here as well...
             i += 1
@@ -53,7 +53,7 @@ def get_movie_vectors(self):
 
             i = 0
             for movie in tx.run(list_of_moview_query):
-                movie_id = movie["movieId"];
+                movie_id = movie["movieId"]
                 vector = tx.run(query, {"movieId": movie_id})
                 movies_VSM[movie_id] = vector.single()[0]
                 i += 1
@@ -110,6 +110,6 @@ def recommendTo(self, user_id, k):
     uri = "bolt://localhost:7687"
     recommender = ContentBasedRecommender(uri=uri, user="neo4j", password="pippo1")
     # would be nice to have a control of execution - like, recalculate everything only if specific flag is set, or something like
-    recommender.compute_and_store_similarity();
-    top10 = recommender.recommendTo("598", 10);
+    recommender.compute_and_store_similarity()
+    top10 = recommender.recommendTo("598", 10)
     print(top10)
@@ -18,7 +18,7 @@ pip install -r requirements.txt
 ## Download & import the dataset
 
 
-For this chapter we're using the [Retailrocket recommender system dataset](https://www.kaggle.com/retailrocket/ecommerce-dataset?select=item_properties_part1.csv) available at Kaggle.  From this dataset we need only the `events.csv` file (you can download the whole dataset), download it and put it into some directory.
+For this chapter we're using the [Retailrocket recommender system dataset](https://www.kaggle.com/retailrocket/ecommerce-dataset) available from Kaggle. From this dataset we only need the `events.csv` file (although you can download the whole dataset); download it and put it into a directory of your choice.
 
 Importing of data is performed with following command (you may need to update Neo4j username & password in the file):
 
 
@@ -3,6 +3,7 @@
 from neo4j import GraphDatabase
 import sys
 
+
 class RetailRocketImporter(object):
 
     def __init__(self, uri, user, password):
@@ -16,7 +17,7 @@ def executeNoException(self, session, query):
             session.run(query)
         except Exception as e:
             pass
-        
+
     def import_user_item(self, file):
         with open(file, 'r+') as in_file:
             reader = csv.reader(in_file, delimiter=',')
@@ -27,8 +28,8 @@ def import_user_item(self, file):
                 self.executeNoException(session, "CREATE CONSTRAINT ON (u:Item) ASSERT u.itemId IS UNIQUE")
 
                 tx = session.begin_transaction()
-                i = 0;
-                j = 0;
+                i = 0
+                j = 0
                 query = """
                     MERGE (item:Item {itemId: $itemId})
                     MERGE (user:User {userId: $userId})
@@ -43,7 +44,7 @@ def import_user_item(self, file):
                             item_id = strip(row[3])
 
                             if event_type == "transaction":
-                                tx.run(query, {"itemId":item_id, "userId": user_id,  "timestamp": timestamp})
+                                tx.run(query, {"itemId": item_id, "userId": user_id, "timestamp": timestamp})
                                 i += 1
                                 j += 1
                                 if i == 1000:
@@ -59,13 +60,14 @@ def import_user_item(self, file):
 
 def strip(string): return ''.join([c if 0 < ord(c) < 128 else ' ' for c in string])
 
+
 if __name__ == '__main__':
     start = time.time()
     uri = "bolt://localhost:7687"
     user = "neo4j"
-    password = "q1" # pippo1
+    password = "q1"  # pippo1
     file_path = "/Users/ale/neo4j-servers/gpml/dataset/retailrocket-recommender-system-dataset/events.csv"
-    if (len(sys.argv) > 1):
+    if len(sys.argv) > 1:
         file_path = sys.argv[1]
     importing = RetailRocketImporter(uri=uri, user=user, password=password)
     importing.import_user_item(file=file_path)
 
@@ -1 +1 @@
-neo4j
+neo4j>=4.0,<4.3
@@ -1,4 +1,3 @@
-import itertools
 from enum import Enum
 from typing import Dict, List
 
 
@@ -1 +1 @@
-neo4j
+neo4j>=4.0,<4.3
@@ -2,7 +2,12 @@ init:
 	pip install -U -r requirements.txt
 
 get_data:
-	curl -L -o yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
+	if [ ! -f  yoochoose-data.7z ]; then \
+		curl -L -o yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z ;\
+	fi; \
+	if [ ! -f yoochoose-buys.dat ]; then \
+		7z x yoochoose-data.7z; \
+	fi
 
 test:
 	nosetests tests
@@ -1 +1,34 @@
-make
+This directory contains the code for importing the data used in chapter 6 of the book.
+
+## Install dependencies
+
+To install all the necessary dependencies, just run:
+
+```sh
+make
+```
+
+or 
+
+```sh
+pip install -r requirements.txt
+```
+
+
+## Download & import the dataset
+
+
+For this chapter we're using the [YooChoose dataset](https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z). To download & unpack it, just type the following (you need to have the 7-Zip binary installed):
+
+```sh
+make get_data
+```
+
+The data is imported with the following command (you may need to update the Neo4j username & password in the file):
+
+```sh
+python import_yoochoose.py path_to_youchoose_dataset
+
+```
+
+If you used `make get_data`, use `.` for `path_to_youchoose_dataset`
@@ -24,7 +24,7 @@ def import_session_data(self, file):
             self.executeNoException(session, "CREATE CONSTRAINT ON (s:Session) ASSERT s.sessionId IS UNIQUE")
             self.executeNoException(session, "CREATE CONSTRAINT ON (i:Item) ASSERT i.itemId IS UNIQUE")
             dtype = {"sessionID": np.int64, "itemID": np.int64, "category": np.object}
-            j = 0;
+            j = 0
             for chunk in pd.read_csv(file,
                                      header=0,
                                      dtype=dtype,
@@ -33,7 +33,7 @@ def import_session_data(self, file):
                                      chunksize=10**6):
                 df = chunk
                 tx = session.begin_transaction()
-                i = 0;
+                i = 0
                 query = """
                         MERGE (session:Session {sessionId: $sessionId})
                         MERGE (item:Item {itemId: $itemId, category: $category})
@@ -68,7 +68,7 @@ def import_session_data(self, file):
     def import_buys_data(self, file):
         with self._driver.session() as session:
             dtype = {"sessionID": np.int64, "itemID": np.int64, "price": np.float, "quantity": np.int}
-            j = 0;
+            j = 0
             for chunk in pd.read_csv(file,
                                      header=0,
                                      dtype=dtype,
@@ -77,7 +77,7 @@ def import_buys_data(self, file):
                                      chunksize=10**6):
                 df = chunk
                 tx = session.begin_transaction()
-                i = 0;
+                i = 0
                 query = """
                         MATCH (session:Session {sessionId: $sessionId})
                         MATCH (item:Item {itemId: $itemId})
@@ -115,7 +115,7 @@ def strip(string): return ''.join([c if 0 < ord(c) < 128 else ' ' for c in strin
     user = "neo4j"
     password = "q1" # pippo1
     base_path = "/Users/ale/neo4j-servers/gpml/dataset/yoochoose-data"
-    if (len(sys.argv) > 1):
+    if len(sys.argv) > 1:
         base_path = sys.argv[1]
     importing = YoochooseImporter(uri=uri, user=user, password=password)
     importing.import_session_data(file=base_path + "/yoochoose-clicks.dat")
 
@@ -22,7 +22,7 @@ def executeNoException(self, session, query):
 
     def import_session_data(self, file):
         dtype = {"sessionID": np.int64, "itemID": np.int64, "category": np.object}
-        j = 0;
+        j = 0
         sess_clicks = {}
         for chunk in pd.read_csv(file,
                                  header=0,
@@ -66,7 +66,7 @@ def import_session_data(self, file):
             self.executeNoException(session, "CREATE CONSTRAINT ON (i:Item) ASSERT i.itemId IS UNIQUE")
 
             tx = session.begin_transaction()
-            i = 0;
+            i = 0
             j = 0
             query = """
                 CREATE (session:Session {sessionId: $sessionId})
@@ -101,8 +101,8 @@ def import_session_data(self, file):
     def import_buys_data(self, file, sess_clicks):
         with self._driver.session() as session:
             dtype = {"sessionID": np.int64, "itemID": np.int64, "price": np.float, "quantity": np.int}
-            i = 0;
-            j = 0;
+            i = 0
+            j = 0
             query = """
                 MATCH (session:Session {sessionId: $sessionId})
                 MATCH (item:Item {itemId: $itemId})
@@ -146,8 +146,8 @@ def post_processing(self, sess_clicks):
         print("start post processing")
         with self._driver.session() as session:
             tx = session.begin_transaction()
-            i = 0;
-            j = 0;
+            i = 0
+            j = 0
             post_processing_query = """
                 MATCH (s:Session {sessionId: $sessionId})-[:CONTAINS]->(click)
                 WITH s, click
@@ -185,7 +185,7 @@ def strip(string): return ''.join([c if 0 < ord(c) < 128 else ' ' for c in strin
     user = "neo4j"
     password = "q1" # pippo1
     base_path = "/Users/ale/neo4j-servers/gpml/dataset/yoochoose-data"
-    if (len(sys.argv) > 1):
+    if len(sys.argv) > 1:
         base_path = sys.argv[1]
     importer = YoochooseImporter(uri=uri, user=user, password=password)
 
 
@@ -1,2 +1,2 @@
-neo4j
+neo4j>=4.0,<4.3
 pandas
@@ -1,3 +1,4 @@
-neo4j
+neo4j>=4.0,<4.3
 pandas
 sklearn
+annoy
@@ -15,7 +15,7 @@ def close(self):
     def compute_and_store_similarity(self):
         items_VSM = self.get_item_vectors()
         for item in items_VSM:
-            knn = self.compute_knn(item, items_VSM.copy(), 20);
+            knn = self.compute_knn(item, items_VSM.copy(), 20)
             self.store_knn(item, knn)
 
     def compute_knn(self, item, items, k):
@@ -47,7 +47,7 @@ def get_item_vectors(self):
         with self._driver.session() as session:
             i = 0
             for item in session.run(list_of_items_query):
-                item_id = item["itemId"];
+                item_id = item["itemId"]
                 vector = session.run(query, {"itemId": item_id})
                 items_VSM_sparse[item_id] = vector.single()[0]
                 i += 1
@@ -100,7 +100,7 @@ def recommend_to(self, item_id, k):
     user = "neo4j"
     password = "q1" # pippo1
     recommender = SessionBasedRecommender(uri=uri, user=user, password=password)
-    recommender.compute_and_store_similarity();
-    top10 = recommender.recommend_to(214842060, 10);
+    recommender.compute_and_store_similarity()
+    top10 = recommender.recommend_to(214842060, 10)
     recommender.close()
     print(top10)
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-import itertools`
`2`	`1`	`from enum import Enum`
`3`	`2`	`from typing import Dict, List`
`4`	`3`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-neo4j`
	`1`	`+neo4j>=4.0,<4.3`
`2`	`2`	`pandas`
-Original file line number
+Diff line change
 -neo4j
 +neo4j>=4.0,<4.3
 pandas
 sklearn
 +annoy