diff --git a/Ch4/01_OnePipeline_ManyClassifiers.ipynb b/Ch4/01_OnePipeline_ManyClassifiers.ipynb index 87ea672..b5b5122 100644 --- a/Ch4/01_OnePipeline_ManyClassifiers.ipynb +++ b/Ch4/01_OnePipeline_ManyClassifiers.ipynb @@ -1,736 +1,744 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "janWv1vG5xUD" - }, - "source": [ - "# Text Classification with Naive Bayes, Logistic Regression, SVM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gBCjEALX5xWj" - }, - "source": [ - "**Overview:** This notebook aims to give you a brief overview of performing text classification using Naive Bayes, Logistic Regression and Support Vector Machines. We will be using a dataset called \"Economic news article tone and relevance\" from [Figure-Eight](https://github.com/practical-nlp/practical-nlp/blob/master/Ch4/Data/Full-Economic-News-DFE-839861.csv) which consists of approximately 8000 news articles, which were tagged as relevant or not relevant to the US Economy. Our goal in this notebook is to explore the process of training and testing text classifiers for this problem, using this data set and two text classification algorithms: Multinomial Naive Bayes and Logistic Regression, implemented in sklearn. \n", - "\n", - "##### Dataset Link: In the a folder called Data in folder Ch4 of this repo\n", - "

\n", - "Let's import few necessary packages before we start our work" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "janWv1vG5xUD" + }, + "source": [ + "# Text Classification with Naive Bayes, Logistic Regression, SVM" + ] }, - "id": "Mee0VQbBXDto", - "outputId": "459d0120-aa17-4536-bc9e-e2395bfa6886" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting scikit-learn==0.21.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9f/c5/e5267eb84994e9a92a2c6a6ee768514f255d036f3c8378acfa694e9f2c99/scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7MB)\n", - "\u001b[K |████████████████████████████████| 6.7MB 3.0MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Installing collected 
packages: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n", - "Requirement already satisfied: matplotlib==3.2.2 in /usr/local/lib/python3.7/dist-packages (3.2.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.3.1)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.19.5)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.4.7)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (0.10.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib==3.2.2) (1.15.0)\n" - ] - } - ], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install scikit-learn==0.21.3\n", - "!pip install matplotlib==3.2.2\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "n7dE_FbM1lk5" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 
1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "QBvvarqE5xWm" - }, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import numpy as np\n", - "import pandas as pd # to work with csv files\n", - "\n", - "# matplotlib imports are used to plot confusion matrices for the classifiers\n", - "import matplotlib as mpl \n", - "import matplotlib.cm as cm \n", - "import matplotlib.pyplot as plt \n", - "\n", - "# import feature extraction methods from sklearn\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction import stop_words\n", - "\n", - "# pre-processing of text\n", - "import string\n", - "import re\n", - "\n", - "# import classifiers from sklearn\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.svm import LinearSVC\n", - "\n", - "# import different metrics to evaluate the classifiers\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "# from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix \n", - "from sklearn import metrics\n", - "\n", - "# import time function from time module to track the training duration\n", - "from time import time" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1giNRemr1lk7" - }, - "source": [ - "### Section 1: Load and explore the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "gBCjEALX5xWj" + }, + "source": [ + "**Overview:** This notebook aims to give you a brief overview of performing text classification using Naive Bayes, Logistic Regression and Support Vector 
Machines. We will be using a dataset called \"Economic news article tone and relevance\" from [Figure-Eight](https://github.com/practical-nlp/practical-nlp/blob/master/Ch4/Data/Full-Economic-News-DFE-839861.csv) which consists of approximately 8000 news articles, which were tagged as relevant or not relevant to the US Economy. Our goal in this notebook is to explore the process of training and testing text classifiers for this problem, using this data set and three text classification algorithms: Multinomial Naive Bayes, Logistic Regression and Support Vector Machines, implemented in sklearn.\n", + "\n", + "##### Dataset Link: In a folder called Data inside the Ch4 folder of this repo\n", + "

\n", + "Let's import few necessary packages before we start our work" + ] }, - "id": "fVD8N_E51lk7", - "outputId": "b5893f5e-1123-43f7-d3a5-2e4fb92bfdc9" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-07-16 08:09:13-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 12383529 (12M) [text/plain]\n", - "Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’\n", - "\n", - "Full-Economic-News- 100%[===================>] 11.81M 22.9MB/s in 0.5s \n", - "\n", - "2021-07-16 08:09:14 (22.9 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]\n", - "\n", - "total 12M\n", - "drwxr-xr-x 2 root root 4.0K Jul 16 08:09 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:09 ..\n", - "-rw-r--r-- 1 root root 12M Jul 16 08:09 Full-Economic-News-DFE-839861.csv\n" - ] - } - ], - "source": [ - "try:\n", - " from google.colab import files\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", - " !ls -lah DATAPATH\n", - " our_data = pd.read_csv(\"DATAPATH/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )\n", - "\n", - "except ModuleNotFoundError:\n", - " our_data = pd.read_csv(\"Data/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mee0VQbBXDto", + "outputId": 
"7ee35588-1066-4c90-dd1a-f5d30bb13f02" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: numpy==1.23.5 in /usr/local/lib/python3.10/dist-packages (1.23.5)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: matplotlib==3.7.1 in /usr/local/lib/python3.10/dist-packages (3.7.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (4.42.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.4.4)\n", + "Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) 
(3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib==3.7.1) (1.16.0)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install numpy==1.23.5\n", + "!pip install pandas==1.5.3\n", + "!pip install matplotlib==3.7.1\n", + "!pip install scikit-learn==1.2.2\n", + "\n", + "# ===========================" + ] }, - "id": "LbED8Q185xWu", - "outputId": "2ded8ddf-5553-4f4a-b55f-16454270648d" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(8000, 15)" + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "n7dE_FbM1lk5" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r 
\"ch4-requirements.txt\"\n", + "\n", + "# ===========================" ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "no 0.821375\n", - "yes 0.177500\n", - "not sure 0.001125\n", - "Name: relevance, dtype: float64" + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "QBvvarqE5xWm" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import numpy as np\n", + "import pandas as pd # to work with csv files\n", + "\n", + "# matplotlib imports are used to plot confusion matrices for the classifiers\n", + "import matplotlib as mpl\n", + "import matplotlib.cm as cm\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# import feature extraction methods from sklearn\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction import _stop_words # This Module has become private after sklearn 0.24 thus stop_words changed to _stop_words\n", + "\n", + "# pre-processing of text\n", + "import string\n", + "import re\n", + "\n", + "# import classifiers from sklearn\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "# import different metrics to evaluate the classifiers\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn import metrics\n", + "\n", + "# import time function from time module to track the training duration\n", + "from time import time" ] - }, - "execution_count": 5, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "display(our_data.shape) # Number of rows (instances) and columns in the dataset\n", - "our_data[\"relevance\"].value_counts()/our_data.shape[0] # Class distribution in the dataset" - ] 
- }, - { - "cell_type": "markdown", - "metadata": { - "id": "vCED1t7F5xW9" - }, - "source": [ - "There is an imbalance in the data with **not relevant** being 82% in the dataset. That is, most of the articles are not relevant to US Economy, which makes sense in a real-world scenario, as news articles discuss various topics. We should keep this class imbalance mind when interpreting the classifier performance later. Let us first convert the class labels into binary outcome variables for convenience. 1 for Yes (relevant), and 0 for No (not relevant), and ignore \"Not sure\". " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "BYW_S3585xXF", - "outputId": "b64bb281-6512-43b5-eda9-73d43becb1ae" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(7991, 2)" + "cell_type": "markdown", + "metadata": { + "id": "1giNRemr1lk7" + }, + "source": [ + "### Section 1: Load and explore the dataset" ] - }, - "execution_count": 6, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# convert label to a numerical variable\n", - "our_data = our_data[our_data.relevance != \"not sure\"] # removing the data where we don't want relevance=\"not sure\".\n", - "our_data.shape\n", - "our_data['relevance'] = our_data.relevance.map({'yes':1, 'no':0}) # relevant is 1, not-relevant is 0. \n", - "our_data = our_data[[\"text\",\"relevance\"]] # Let us take only the two columns we need.\n", - "our_data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fOKz8xQr5xXJ" - }, - "source": [ - "### Section 2: Text Pre-processing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yhC5TZuL5xXK" - }, - "source": [ - "Typical steps involve tokenization, lower casing, removing, stop words, punctuation markers etc, and vectorization. Other processes such as stemming/lemmatization can also be performed. 
Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK) or sometimes, custom stopword lists are needed depending on the task. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "7MZSHdHZ5xXL" - }, - "outputs": [], - "source": [ - "stopwords = stop_words.ENGLISH_STOP_WORDS\n", - "def clean(doc): # doc is a string of text\n", - " doc = doc.replace(\"
\", \" \") # This text contains a lot of
tags.\n", - " doc = \"\".join([char for char in doc if char not in string.punctuation and not char.isdigit()])\n", - " doc = \" \".join([token for token in doc.split() if token not in stopwords])\n", - " # remove punctuation and numbers\n", - " return doc" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3CfVm42o5xXS" - }, - "source": [ - "### Section 3: Modeling\n", - "\n", - "Now we are ready for the modelling. We are going to use algorithms from sklearn package. We will go through the following steps:\n", - "\n", - "1 Split the data into training and test sets (75% train, 25% test) \n", - "2 Extract features from the training data using CountVectorizer, which is a bag of words feature implementation. We will use the pre-processing function above in conjunction with Count Vectorizer \n", - "3 Transform the test data into the same feature vector as the training data. \n", - "4 Train the classifier \n", - "5 Evaluate the classifier " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "GimJJHhg5xYl", - "outputId": "7ed9cad8-3bd8-416d-a352-4a44fad9dc80" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7991,) (7991,)\n", - "(5993,) (5993,)\n", - "(1998,) (1998,)\n" - ] - } - ], - "source": [ - "import sklearn\n", - "#from sklearn.cross_validation import train_test_split\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Step 1: train-test split\n", - "X = our_data.text # the column text contains textual data to extract features from\n", - "y = our_data.relevance # this is the column we are learning to predict. \n", - "print(X.shape, y.shape)\n", - "# split X and y into training and testing sets. 
By default, it splits 75% training and 25% test\n", - "# random_state=1 for reproducibility\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n", - "print(X_train.shape, y_train.shape)\n", - "print(X_test.shape, y_test.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "fVD8N_E51lk7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a36f603b-afc2-475e-f6c9-46b3299db3f4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-08-22 16:03:42-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 12383529 (12M) [text/plain]\n", + "Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’\n", + "\n", + "Full-Economic-News- 100%[===================>] 11.81M 71.4MB/s in 0.2s \n", + "\n", + "2023-08-22 16:03:43 (71.4 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]\n", + "\n", + "total 12M\n", + "drwxr-xr-x 2 root root 4.0K Aug 22 16:03 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 16:03 ..\n", + "-rw-r--r-- 1 root root 12M Aug 22 16:03 Full-Economic-News-DFE-839861.csv\n" + ] + } + ], + "source": [ + "try:\n", + " from google.colab import files\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + " !ls -lah DATAPATH\n", + " our_data = pd.read_csv(\"DATAPATH/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )\n", + "\n", + "except ModuleNotFoundError:\n", + " our_data = pd.read_csv(\"Data/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )" + ] }, - "id": "gsUyIBUD5xZI", - "outputId": "f4082e6a-a1e9-4b4a-c247-8b1b84c7edae" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(5993, 49753) (1998, 49753)\n" - ] - } - ], - "source": [ - "# Step 2-3: Preprocess and Vectorize train and test data\n", - "vect = CountVectorizer(preprocessor=clean) # instantiate a vectoriezer\n", - "X_train_dtm = vect.fit_transform(X_train)# use it to extract features from training data\n", - "# transform testing data (using training data's features)\n", - "X_test_dtm = vect.transform(X_test)\n", - "print(X_train_dtm.shape, X_test_dtm.shape)\n", - "# i.e., the dimension of our feature vector is 49753!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 106 + }, + "id": "LbED8Q185xWu", + "outputId": "7672d092-6fda-401a-9651-05e35794a3a0" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "(8000, 15)" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "no 0.821375\n", + "yes 0.177500\n", + "not sure 0.001125\n", + "Name: relevance, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "display(our_data.shape) # Number of rows (instances) and columns in the dataset\n", + "our_data[\"relevance\"].value_counts()/our_data.shape[0] # Class distribution in the dataset" + ] }, - "id": "nDLwA4CL5xZq", - "outputId": "3cb119d8-3017-4ebb-89b9-86dca66e3e92" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 14 ms, sys: 994 µs, total: 14.9 ms\n", - "Wall time: 15.2 ms\n" - ] - } - ], - "source": [ - "# Step 3: Train the classifier and predict for test data\n", - "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", - "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", - "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 + "cell_type": "markdown", + "metadata": { + "id": "vCED1t7F5xW9" + }, + "source": [ + "There is an imbalance in the data with **not relevant** being 82% in the dataset. That is, most of the articles are not relevant to US Economy, which makes sense in a real-world scenario, as news articles discuss various topics. 
We should keep this class imbalance in mind when interpreting the classifier performance later. Let us first convert the class labels into binary outcome variables for convenience. 1 for Yes (relevant), and 0 for No (not relevant), and ignore \"Not sure\"." + ] }, - "id": "LiCHjvc75xZ3", - "outputId": "1409e48f-0ed6-4705-8688-4e6126662863" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.7822822822822822\n", - "ROC_AOC_Score: 0.7251117679464362\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BYW_S3585xXF", + "outputId": "a3e800a7-e175-4308-dbfe-33ef45e4ba85" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(7991, 2)" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# convert label to a numerical variable\n", + "our_data = our_data[our_data.relevance != \"not sure\"] # removing the data where we don't want relevance=\"not sure\".\n", + "our_data.shape\n", + "our_data['relevance'] = our_data.relevance.map({'yes':1, 'no':0}) # relevant is 1, not-relevant is 0.\n", + "our_data = our_data[[\"text\",\"relevance\"]] # Let us take only the two columns we need.\n", + "our_data.shape" + ] }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfgAAAG7CAYAAAAv5Ie9AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd5wcdf3H8dc7jURCQgmhJEAooXciCtJUwFAkKFIVpAuCShVEiRFsICBIkSIQpRi6BESKP0ABARNCL4EQgilAEkgIgYS0z++PmQubzd3OXu729mbyfvLYx+3MfPc7n90c99lvme8oIjAzM7Ni6VDvAMzMzKz1OcGbmZkVkBO8mZlZATnBm5mZFZATvJmZWQE5wZuZmRWQE7wtNSR1k3SPpA8l3daCer4t6cHWjK1eJO0oaXSNzzFT0joVjo+TtGsrnWthXZKGSLqxQtnjJb2XxrdSa5zfrD1xgrd2R9Ihkkamf3jfkfQPSTu0QtXfAlYBVoqI/Ze0koi4KSJ2b4V4akpSSFqvUpmIeCwiNqhlHBHRPSLGpjENlfTLWp6vGpI6AxcBu6fxvd+Cuvqln3Wn1ovQrOWc4K1dkXQKcDHwa5JkvCZwBTCoFapfC3g9Iua1Ql25t5QnpFWArsDL9Q5ECf8ttlbnXyprNyT1BM4BToiIOyPi44iYGxH3RMTpaZllJF0saVL6uFjSMumxXSRNkHSqpMlp6/+I9NgvgMHAgWnPwFHlXbjlLTFJh0saK+kjSW9J+nbJ/sdLXre9pBFp1/8ISduXHHtU0rmSnkjreVBSrybef0P8Py6Jf19Je0p6XdIHks4qKb+tpCclTU/LXiapS3rs32mx59P3e2BJ/WdIehe4vmFf+pp103NsnW6vLmmKpF0aifUISfeUbL9ROuwhabykLdPnIWk9SccC3wZ+nMZ0T0mVW0p6If0Mb5HUtYnPaF1JD0t6X9JUSTdJWr6xsk2RtD7QMCwxXdLD6f4NJT2UfgajJR1Q8pq9JD0raUb63oaUVPnvkrpmStquit+tRyX9StITwCfAOhnn31PSK+nv0ERJpzXnPdtSKiL88KNdPICBwDygU4Uy5wBPAb2BlYH/AOemx3ZJX38O0BnYk+SP5wrp8SHAjSV1lW/3AwLoBCwLzAA2SI+tBmySPj8ceDx9viIwDTg0fd3B6fZK6fFHgTeB9YFu6fZvm3hvDfEPTuM/BpgC3AwsB2wCzALWTstvA3wxPW8/4FXgpJL6AlivkfrPA5ZJ49kFmFBS5hjgFeBzwAPABU3Eug4wnaSRsDrwdkM96bFpQIfyOIChwC/L6hoH/DetZ8X0fRzXxHnXA3ZL41+ZJLleXFbXro39+5bVs/DfOt1eFhgPHJF+nlsBU4GNSz67zdL3uznwHrBvY3Vl/W6V/F78L/037QT0zDj/O8CO6fMVgK3r/f+rH+3/4Ra8tScrAVOjchf6t4FzImJyREwBfkGSXBvMTY/PjYj7gJnAko4xLwA2ldQtIt6JiMa6c/cC3oiIGyJiXkT8FXgN+HpJmesj4vWImAXcCmxZ4ZxzgV9FxFxgGNALuCQiPkrP/wqwBUBEPBMRT6XnHQdcBexcxXv6eUR8msaziIi4BhgDPE3ypeanjVUSyZj6R+l72Ynky8AkSRumMTwWEQsyYin1h4iYFBEfAPfQxGcUEWMi4qE0/ikk4+hZ77kaewPjIuL69PN8FrgD2D8976MR8WJELIiIF4C/tsJ5h0bEy+nv+8BK5yf5vdhYUo+ImBYRo1p4blsKOMFbe/I+0EuVx4YbWosN3k73Layj7AvCJ0D35gYSER8DBwLHAe9I+nuavLLiaYipT8n2u82I5/2ImJ8+b0jA75Ucn9XweknrS7pX0ruSZpDMW2i0+7/ElIiYnVHmGmBT4NKI+LRCuX+RtGx3Sp8/SpL0dk63m6Oqz0jSKpKGpd3UM4AbyX7P1VgL+EI63DFd0nSSL5Orpuf
9gqRH0iGLD0l+L1p63vHVnh/Yj6RH6m1J/5K0XQvPbUsBJ3hrT54EPgX2rVBmEskfwwZrpvuWxMckXdENVi09GBEPRMRuJC3Z10gSX1Y8DTFNXMKYmuOPJHH1j4gewFmAMl5T8faRkrqTTHK8FhgiacUKxRsS/I7p83+RneBbevvKX6d1bJa+5++Q/Z6rMR74V0QsX/LoHhHHp8dvBoYDa0RET+DKkvM29p4q/m418rqK54+IERExiGRo6m8kPUFmFTnBW7sRER+SjD9fnk4u+5ykzpL2kHR+WuyvwM8krZxOVhtM0opbEs8BO0laU8kEv580HEhbioMkLUvypWMmSfd2ufuA9ZVc2tdJ0oHAxsC9SxhTcyxHMk9gZtq7cHzZ8fdIxsOb4xJgZEQcDfydJJE15V/Al4FuETEBeIykq3kl4NkmXrMkMZVajuTf4kNJfYDTW1BXqXtJ/h0PTX/nOkv6vKSNSs77QUTMlrQtcEjJa6eQ/G6Uvq8mf7eae35JXZSsvdAzHbqZQeO/i2aLcIK3diUiLgROAX5G8odzPHAiSasF4JfASOAF4EVgVLpvSc71EHBLWtczLJqUO6RxTAI+IGmVlidQIrl+em/gVJIhhh8De0fE1CWJqZlOI0k0H5H0LtxSdnwI8Oe0y/cAMkgaRJKgG97nKcDWSq8eKBcRr5Mk28fS7RnAWOCJkmGGcteSjCVPl/S3JspU8gtga+BDki8gdy5BHYuJiI+A3YGDSP7N3+WzyYgA3wfOkfQRyZfKW0te+wnwK+CJ9H19MeN3a0nOfygwLh2WOI6k+96sIkW0tMfMzMzM2hu34M3MzArICd7MzKyAnODNzMwKyAnezMysgJbmm020G+rULdRluXqHYbaYrTZas94hmDVp1KhnpkbEyrU+T8cea0XMW2zhx2aJWVMeiIiBrRRSVZzg2wF1WY5lNsi8ismszT3x9GX1DsGsSd06q3wVyZqIebNa/Dd69nOXt8aKi83iBG9mZlaRIId39M1fxGZmZpbJLXgzM7NKBKg1bnnQtpzgzczMsuSwi94J3szMLItb8GZmZkXjSXZmZmbWTrgFb2ZmlsVd9GZmZgUjctlF7wRvZmZWkXLZgs/fVxIzMzPL5Ba8mZlZFnfRm5mZFVAOu+id4M3MzCrydfBmZmbWTrgFb2ZmVolvNmNmZlZQOeyid4I3MzOryGPwZmZm1k64BW9mZpalg8fgzczMisVr0ZuZmRVUDmfR5+8riZmZWZtKJ9m15JF1BmmgpNGSxkg6s5Hja0p6RNKzkl6QtGdWnU7wZmZmdSSpI3A5sAewMXCwpI3Liv0MuDUitgIOAq7IqtcJ3szMLIvUskdl2wJjImJsRMwBhgGDysoE0CN93hOYlFWpx+DNzMyytHySXS9JI0u2r46Iq9PnfYDxJccmAF8oe/0Q4EFJPwCWBXbNOqETvJmZWSXVtcKzTI2IAS14/cHA0Ii4UNJ2wA2SNo2IBU29wF30ZmZm9TURWKNku2+6r9RRwK0AEfEk0BXoValSJ3gzM7MstZ1FPwLoL2ltSV1IJtENLyvzP+CrAJI2IknwUypV6i56MzOzLDW8Dj4i5kk6EXgA6AhcFxEvSzoHGBkRw4FTgWsknUwy4e7wiIhK9TrBm5mZVVT7m81ExH3AfWX7Bpc8fwX4UnPqdBe9mZlZAbkFb2ZmliWHS9U6wZuZmVXim82YmZkVUe3H4GshfxGbmZlZJrfgzczMsngM3szMrIBy2EXvBG9mZpbFLXgzM7OCkSfZmZmZWTvhFryZmVkWd9GbmZkVj5zgzczMikXkM8F7DN7MzKyA3II3MzOrROkjZ5zgzczMKlIuu+id4M3MzDLkMcF7DN7MzKyA3II3MzPLkMcWvBO8mZlZBid4MzOzosnpLHqPwZuZmRWQW/BmZmYVyJfJmZmZFZMTvJmZWQE5wZuZmRVQHhO8J9mZmZkVkFvwZmZmleT0MjkneDMzswx57KJ3gjczM6s
gr5fJeQzezMysgNyCNzMzy5DHFrwTvJmZWZb85XcneDMzs4qUzxa8x+DNzMwKyC14MzOzDHlswTvBm5mZZXCCNzMzKxhfB29mZmZLRNJASaMljZF0ZiPHfy/pufTxuqTpWXW6BW9mZpalhg14SR2By4HdgAnACEnDI+KVhjIRcXJJ+R8AW2XV6xa8mZlZJellci15ZNgWGBMRYyNiDjAMGFSh/MHAX7MqdQvezMwsQyuMwfeSNLJk++qIuDp93gcYX3JsAvCFJuJYC1gbeDjrhE7wZmZmGVohwU+NiAGtEMpBwO0RMT+roLvozczM6msisEbJdt90X2MOoorueXCCt5zZbfuNeP6us3np7p9z2hG7LXZ8jVVX4P6rf8iTfz2D/97yE762w8YLj23af3Ue/fOpPHP7Txlx61ks08UdWNZ6HnzgfjbfZAM22XA9fnf+bxc7/vhj/2a7z29N966duPOO2xc7PmPGDNbt15eTfnhiW4RrzaUWPiobAfSXtLakLiRJfPhiIUgbAisAT1YTsv/CWW506CAuPvMA9jr+Mia+N53Hbzqde//1Iq+NfXdhmTOOHsgdD43imtseZ8N1VuVvlx7Phnv9nI4dO3DdL7/LUWf/hRdfn8iKPZdl7rzMHi6zqsyfP5+TfngCf//HQ/Tp25cdvvh59t57Hzba+LMvmGussSZXXzuUiy+6oNE6fvHzs9lhx53aKmRrplpeBx8R8ySdCDwAdASui4iXJZ0DjIyIhmR/EDAsIqKaep3gLTc+v2k/3hw/lXET3wfgtgdGsfcumy+S4COCHst2BaBn9268M+VDAHbdbkNeemMiL76e9Hp98OHHbRy9FdmI//6Xddddj7XXWQeA/Q88iHvvuXuRBL9Wv34AdOiweMfpqGeeYfLk99h994E888zIxY5bfVU5E75FIuI+4L6yfYPLtoc0p0530VturN67JxPem7Zwe+J70+izcs9Fyvzqqvs4aM9tGXP/udx16fGcct5tAPRfszcRMPzyE/jPzWdwynd3bdPYrdgmTZpI376fDaH26dOXiRObGkJd1IIFCzjzx6fym/Mab9mbLak2TfCSQtKFJdunSRqS8Zp9JW3cxLEhkiamK/u8IungKmKY2ezAl5CkwyWt3lbnMzhg4ABuvOcp1ht4Nt/4wR+59peHIYlOHTuy/VbrcMRPh/LVIy9in69swS7brl/vcM246o9X8LU99qRv3771DsUqqPF18DXR1i34T4FvSurVjNfsCzSa4FO/j4gtSRYFuEpS55YE2MoOB5zgW8mkyR/Sd5UVFm73WWUFJqZd8A2+u+923PHgKACefuEtunbpTK/ll2Xi5Ok8PupN3p/+MbNmz+X+x19mqw3XwKw1rL56HyZM+Owy5okTJ9CnT5+qXvv0U09y5RWXscF6/fjJGadx841/4WdnLbZSqdWZE3y2ecDVwMnlByT1k/SwpBck/Z+kNSVtD+wD/C5tpa/bVMUR8QbwCckMQySdLmlEWt8vGntNY2Uk/VbSCSVlhqQ9Dd3TuEZJelHSoJK4X5V0jaSXJT0oqZukbwEDgJvS2Lst6YdmiZEvv816a67MWquvROdOHdn/a1vz90dfWKTM+Hc/YJdtNwBgg7VXoesynZkybSYP/ecVNllvdbp17UzHjh3YcZv1eLVk7N6sJQZ8/vOMGfMG4956izlz5nDbLcPYa+99qnrt0Btu4o2x/2P0mHH85rwLOOQ7h/HLXy8+C9/qrLaz6GuiHmPwlwPfltSzbP+lwJ8jYnPgJuAPEfEfkksFTo+ILSPizaYqlbQ18EZETJa0O9CfZPm/LYFtJO1UVr6pMrcAB5QUPSDdNxv4RkRsDXwZuFCffS3rD1weEZsA04H9IuJ2YCTw7TT2WWXnP1bSSEkjY94ih6wJ8+cv4OTzbuWeK07guTt/xh0PPsurY9/l7OP3Yq+dNwPgzIvu4shvbs/Tt5zJn39zBMcMvgGA6R/N4g83PszjN/6Yp4e
dyXOvjuf+x1+u59uxAunUqRO/v+Qyvr7X19hys43Yb/8D2HiTTThnyGDuvSeZAD1yxAjW7deXO++4jR98/3tsvcUmdY7aik5VzrZvnZNJMyOiezr1fy4wC+geEUMkTQVWi4i5aTf7OxHRS9JQ4N40YZbXNwQ4hiSprg98PSLul3QB8K10P0B34DcRcW1JDJXKvAp8FVgZuCIivpTG9HtgJ2ABsAHJcoFdgYcion8a0xlA54j4paRHgdMiouK02A6f6x3LbHBApSJmdTFtxGX1DsGsSd0665lWWh2uomVW6R99vn1Ji+p46/d7tUmspep1mdzFwCjg+lao6/cRcYGkfYBr0258kSTrqyq8rlKZ20iS/6okrXeAb5Mk/G3SLyHjSJI7JHMLGswH3B1vZlYUqu118LVSl8vkIuID4FbgqJLd/yG5iB+SZPpY+vwjYLkq6hxO0iX+XZLFAo6U1B1AUh9JvcteUqnMLWks3yJJ9gA9gclpcv8ysFYVb7Wq2M3MrP0SILXsUQ/1vA7+QqB0Nv0PgCMkvQAcCvwo3T8MOF3Ss5Um2aXOAU4B/gncDDwp6UXgdsoSbUQ82FSZiHg5fT4xIt5JX3ITMCAtexjwWhXvcShwpSfZmZlZW2vTMXhrnMfgrb3yGLy1Z201Bt911fVjjUP/0KI6xlywx1IzBm9mZpYbORyCd4I3MzPLksdJdk7wZmZmldRxolxL+GYzZmZmBeQWvJmZWQUCOnTIXxPeCd7MzCxDHrvoneDNzMwy5HGSncfgzczMCsgteDMzs0pyOoveCd7MzKyCZC36/GV4J3gzM7OKlMsE7zF4MzOzAnIL3szMLEMOG/BO8GZmZlny2EXvBG9mZlZJTmfRewzezMysgNyCNzMzq8CXyZmZmRVUDvO7E7yZmVkWt+DNzMwKKIf53ZPszMzMisgteDMzs0rkLnozM7PCSWbR1zuK5nOCNzMzq8g3mzEzM7N2wi14MzOzDDlswDvBm5mZZcljF70TvJmZWSW+2YyZmZktCUkDJY2WNEbSmU2UOUDSK5JelnRzVp1uwZuZmVVQ65vNSOoIXA7sBkwARkgaHhGvlJTpD/wE+FJETJPUO6teJ3gzM7MMNR6D3xYYExFj03MNAwYBr5SUOQa4PCKmAUTE5KxK3UVvZmaWQWrZA+glaWTJ49iS6vsA40u2J6T7Sq0PrC/pCUlPSRqYFbNb8GZmZrU3NSIGtOD1nYD+wC5AX+DfkjaLiOmVXmBmZmYV1LiLfiKwRsl233RfqQnA0xExF3hL0uskCX9EU5W6i97MzKySFnbPV/HdYATQX9LakroABwHDy8r8jaT1jqReJF32YytV6ha8mZlZBarxWvQRMU/SicADQEfguoh4WdI5wMiIGJ4e213SK8B84PSIeL9SvU7wZmZmGWq90E1E3AfcV7ZvcMnzAE5JH1VxF72ZmVkBuQVvZmaWoUMO16p1gjczM8uQw/zuBG9mZlZJMhM+fxneY/BmZmYF5Ba8mZlZhg75a8A7wZuZmWXJYxd9kwle0hQgqq0oIjJvXWdmZpZHOczvFVvwl9OMBG9mZmbtR5MJPiKGtGEcZmZm7ZJIlqvNm2aNwUtaAdiU5K43/4iIaZK6AnMiYkEtAjQzM6u3wk6yk9QJ+DVwAtCNpOv+88A04A5gJPDzGsVoZmZWP6rtzWZqpdrr4H8FHAOcCKwDi/RV3A18vZXjMjMzsxaotov+MODMiLheUseyY2+SJH0zM7NCymEDvuoEvzxJIm9MF5L715qZmRWOyOfNZqrton8JGNTEsT2AUa0TjpmZWfuTrEe/5I96qLYF/0vgDkndgNtIJtltKekbwPeAfWoUn5mZWd0VdpJdRNwNHALsCvyDpMfiT8DhwKER8UCtAjQzM7Pmq/o6+Ii4FbhV0gbASsAHwOiI8Gp3ZmZWWPXsZm+JZt9sJiJG1yIQMzOz9qrIk+yQtJmkmyWNkfRx+vNmSZvXMkAzM7N6Uwsf9VD
tSnb7AreSXCp3OzAZ6E0ys36kpAMi4m81i9LMzMyapdou+vNIVqw7oHTMXdJPSGbVnwc4wZuZWSEVdhY9yc1l/lQ+oS7dviY9bmZmVjjJQjcte9RDtQl+JLBJE8c2xQvdmJlZUaU3m2nJox6a7KKX9LmSzVOAYZI6k3TFN4zBfwM4GjiolkGamZlZ81Qag59JsmJdAwG/IbltbOk+gKfxevRmZlZQORyCr5jgj2TRBG9mZrZUyuMkuyYTfEQMbcM4zMzM2qWGSXZ5U/VCN2ZmZpYfVS9VK+lA4BhgfaBr+fGI6N2KcZmZmbUbeeyir6oFL+kQ4M/AGKAvMBy4N339DOCyWgVoZmZWb3lcqrbaLvrTgXOBE9LtKyLiSGBtYCrwSQ1iMzMzqzspudlMSx71UG2C7w88ERHzgflAD4CI+IhkmdoTaxOemZlZ/TXcMnZJH/VQbYKfASyTPp8IbFRyTCT3hzczM7N2otpJdiOAzYEHSMbfB0uaB8wBBgNP1SY8MzOz+svjJLtqE/xvgLXS54PT538k6QEYAXyv9UMzMzNrH3KY36tL8BHxFGkrPSKmA4MkLQMsExEzahifmZlZXYn6TZRriSVe6CYiPnVyNzMzazlJAyWNljRG0pmNHD9c0hRJz6WPo7PqrHQ3ufObEVtExBnNKG9mZpYPNZ4JL6kjcDmwGzABGCFpeES8Ulb0loio+qq1Sl30+zcjvgCc4JdQ914r8cWjv1PvMMwWc8PIt+sdglm7UONJdtsCYyJibHquYcAgoDzBN0ulm82s3ZKKzczMiqIVbtzSS9LIku2rI+Lq9HkfYHzJsQnAFxqpYz9JOwGvAydHxPhGyixU9Vr0ZmZmtsSmRsSAFrz+HuCvEfGppO+RLB//lUov8N3kzMzMKhBJF31LHhkmAmuUbPdN9y0UEe9HxKfp5p+AbbIqdYI3MzPL0EEte2QYAfSXtLakLsBBJIvKLSRptZLNfYBXsyp1F72ZmVmGKpL0EouIeZJOJFkttiNwXUS8LOkcYGREDAd+KGkfYB7wAXB4Vr1O8GZmZnUWEfcB95XtG1zy/CfAT5pTZ7MSvJKBhL4kYwXPR8THzXm9mZlZ3iR3hCvwSnaSvk8y6P828BiwQbr/Tkkn1SY8MzOz+qvxGHxtYq6mkKTTgYuAa0im5ZeG+yhwYKtHZmZm1k7k8X7w1XbRnwAMjojz0yX1So0G1m/dsMzMzKwlqk3wqwLPNHFsAdC1dcIxMzNrXwSFvpvcGGDnJo7tRAvXyzUzM2vPOrTwUQ/VtuAvBq6QNAe4Pd3XW9JRwCnAMbUIzszMrD3IYQO+ugQfEX+StAIwGPhFuvs+4BNgSETcXKP4zMzM6kpSLrvoq74OPiJ+J+lKYHtgJZKVdJ6MiA9rFZyZmZktmWYtdBMRH5EspWdmZrbUyGEDvroEny5yU1FEXNHycMzMzNqfei1W0xLVtuAvq3As0p9O8GZmVjiFvkwuIjqUP4AVgYOB54GNaxmkmZmZNc8S300uIqYDt0jqCVwF7NJaQZmZmbUnOWzAt8rtYt8CBrRCPWZmZu1PHW8Y0xItSvCSVgNOJUnyZmZmhSTyl+GrnUU/hc8m0zXoAiwHzAa+2cpxmZmZWQu0ZBb9bGACcH9EvN96IZmZmbUfySz6ekfRfJkJXlJn4J/AWxExqfYhmZmZtS95TPDVXCY3H3gY2LDGsZiZmbVLklr0qIfMBB8RC4A3SO4Jb2ZmZjlQ7Rj8T4HzJL0YES/WMiAzM7P2pHBj8JJ2AkZFxEzgZyR3kHtO0kTgPcpm1UfEtrUM1MzMrC5UvIVuHgG2A/4LvJQ+zMzMljp5XIu+UoJf+G4i4og2iMXMzKzdyWsXfVU3mzEzM7N8yZpkt6ekqi6Pi4i/tEI8ZmZm7U4Oe+gzE/zgKusJwAnezMwKSHQo4Fr0XwZGtkUgZmZm7ZEoZgt+VkR83CaRmJmZWatpjfvBm5mZFdfSeD94MzOzpUGhroOPCF9
CZ2ZmS728jsE7iZuZmRWQu+jNzMwyFKqL3szMzBI5zO9O8GZmZpWIfI5n5zFmMzOzQpE0UNJoSWMknVmh3H6SQtKArDrdgjczM6tEoBr20UvqCFwO7AZMAEZIGh4Rr5SVWw74EfB0NfW6BW9mZpZBLXxk2BYYExFjI2IOMAwY1Ei5c4HzgNnVxOwEb2ZmVkFyP3i16JGhDzC+ZHtCuu+zGKStgTUi4u/Vxu0uejMzswyt0EHfS1Lpzduujoirqzq31AG4CDi8OSd0gjczM6u9qRHR1MS4icAaJdt9030NlgM2BR5N5wKsCgyXtE9ENHnHVyd4MzOzDDW+Dn4E0F/S2iSJ/SDgkIaDEfEh0OuzWPQocFql5A5O8GZmZhlU01n0ETFP0onAA0BH4LqIeFnSOcDIiBi+JPU6wZuZmVXQFgvdRMR9wH1l+wY3UXaXaur0LHozM7MCcgvezMwsQy276GvFCd7MzCxD/tK7E7yZmVllNV6qtlY8Bm9mZlZAbsGbmZlVkNfbxTrBm5mZZchjF70TvJmZWYb8pfd89jqYmZlZBrfgzczMMuSwh94J3szMrJJkkl3+MrwTvJmZWQa34M3MzApHKIcteE+yMzMzKyC34M3MzDK4i97MzKxgPMnOzMysiJTPFrzH4M3MzArILXgzM7MMeWzBO8GbmZllyONlck7wZmZmFQjokL/87jF4MzOzInIL3szMLIO76M3MzArIk+zMamzAmj05fod+dOgg7n9lMreMmrTI8d02XJljtl+T9z+eA8DdL7zL/a9OAeDo7dZk237L0wExasJ0rnjs7TaP34rrpScf5ZaLz2HB/PnssM+B7HHY9xst98wj/+Cqs47nrOuG02+jzXnlv49x5xXnMW/uXDp17sy3TjyLDQds38bRWxa34M1qqIPgxJ3W5szhrzJ15hwu3X9TnnxrGv+bNmuRcv96430uf2zcIvs2XrU7m6y2HMcNewGAi765CZuv3oMXJs1oq/CtwBbMn8/NFw7m5EtuZIXeq/LrI/dhix13Y/W1+y9SbvbHM3n41utZe5MtF+7r3nMFTvzdtSy/8ipMfHM0l5x0GOff83RbvwUrIE+ys9zYoHd3Jn04m3dnfHadwIEAABlCSURBVMq8BcG/3nif7ddeoarXRkCXjqJTB9G5Ywc6dRDTZs2pccS2tHjrlefo3XctVu6zJp06d+Hzu36d5//94GLl7r76Qr72nePo3GWZhfvW3GBTll95FQBWX2d95nw6m7lzPm2z2C1bwyz6ljzqwQnecqNX9y5MmflZUp4ycw4rLdtlsXI7rLsiVx64GWd/rT8rd0+Ov/reTJ6bOINhR2zDsMO3ZuT/PmT8tNltFrsV2/Qp77Fi79UXbi/fezWmTXlvkTJvj36JDya/w+Zf+kqT9Yx65B+sucGmi3wBsPZALf6vHnLfRS9pPvAiyXt5Czg0IqZXKD8EmBkRF7RBbP2A7SPi5lqfyxJPvTWNR1+fytwFwV6b9Ob0r67Lj+9+ldV7LsOaK3TjkD+PAuC3+2zEpv9bjpfe+ajOEdvSYMGCBdx2ybkcfnbTf3YmjX2dO674LSddfEMbRmZV8Vr0dTMrIraMiE2BD4AT6h1QiX7AIfUOoiimzpyzsEUOsHL3Lgsn0zX46NN5zF0QAPzjlcn0X3lZAL60zoq89t5MZs9dwOy5Cxjx9nQ2WrV72wVvhbb8yqvwweTPJnxOn/wOK6Td7gCzP5nJxLGvc+H3D+In3/gSY19+lst/fDTjXk3mhEyb/A5XnPk9jjz7Inr3XavN47dsauGjHoqQ4Es9CfQBkLSupPslPSPpMUkblhdurIyknpLeltQhLbOspPGSOks6RtIISc9LukPS59IyQyX9QdJ/JI2V9K30FL8FdpT0nKST2+gzKKzRk2fSp2dXVl1uGTp1EDv3X4knx01bpMyKn+u88Pl2/VZYOAFv8kdz2Gz1HnQQdOwgNu/Tg/Flk/PMllS/jbZg8vhxTJ0
0nnlz5zDin/ewxY67LTz+ue49+P39z/Kbu57gN3c9wTqbbMUJ5/+Jfhttzicffcilpx7BN79/ButtMaCO78KKJvdd9A0kdQS+Clyb7roaOC4i3pD0BeAKoHzwa7EyEfEVSc8BOwOPAHsDD0TEXEl3RsQ16fl+CRwFXJrWtRqwA7AhMBy4HTgTOC0i9q7Nu166LAi47LFx/HqfDekg8cCrk3n7g1kctm1fXp/8MU+Nm8a+m6/KF9degfkLgo9mz+OC/3sTgMfefJ8t+/bg6oO2IAhG/u9DnhrX5EiOWbN07NSJg089h4tPOowFC+bzpb0PYPV11ufuqy9irY02Y8uSZF/ukdv/wuQJb3PvdZdw73WXAHDSxTfQY8VebRW+ZUgm2eWvj14RUe8YWqRkDL4P8CrwZaAbMAUYXVJ0mYjYqGEMHriyQplDgJ0i4jhJd5Ek/ock7Qz8Elge6E6S+I+TNBR4KCJuSmP6KCKWk7QLTSR4SccCxwJ0XWHVbXY8965W+kTMWs/+26xW7xDMmnTsdv2eiYiad3tstNlWcf1dj7Soju36r9AmsZYqQgt+VkRsmXaXP0AyBj8UmB4RW1Z4XYcKZYYDv5a0IrAN8HC6fyiwb0Q8L+lwYJeS15Re15L5VS8iribpQaDHmhvl+1uWmVnR5a8BX5wx+Ij4BPghcCrwCfCWpP0BlNiirPyMpspExExgBHAJcG9EzE9fthzwjqTOwLerCOuj9DVmZmZtqjAJHiAingVeAA4mScBHSXoeeBkY1MhLKpW5BfhO+rPB2cDTwBPAa1WE9AIwP52U50l2ZmY55evg6yAiupdtf71kc2Aj5YeUPH+rsTLpsdsp65SJiD8Cf2yk7OGNxRQRc1l8Yp+ZmeVMDufY5T/Bm5mZ1VoO83uxuujNzMzySNJASaMljZF0ZiPHj5P0YrquyuOSNs6q0wnezMwsSw2XskvXcbkc2APYGDi4kQR+c0Rsll75dT5wUVbITvBmZmYVJDm6ppPstgXGRMTYiJgDDKNsYnh65VeDZYHMy6s9Bm9mZlZJ7W820wcYX7I9AfjCYmFIJwCnAF2oYgK3W/BmZma110vSyJLHsc2tICIuj4h1gTOAn2WVdwvezMwsQys04KdWWKp2IrBGyXbfdF9ThtHIJdvl3II3MzPLUtv7xY4A+ktaW1IX4CCSJdM/O73Uv2RzL+CNrErdgjczM6uotqvRRcQ8SSeS3E+lI3BdRLws6RxgZEQMB06UtCswF5gGfDerXid4MzOzDLVeyS4i7gPuK9s3uOT5j5pbp7vozczMCsgteDMzswqqG0Zvf5zgzczMsuQwwzvBm5mZZajXLV9bwmPwZmZmBeQWvJmZWQbfD97MzKyAcpjfneDNzMwqyuk0eo/Bm5mZFZBb8GZmZhnyOIveCd7MzKwC4Ul2ZmZmhZTD/O4xeDMzsyJyC97MzCxLDpvwTvBmZmYZPMnOzMysgDzJzszMrIBymN89yc7MzKyI3II3MzPLksMmvBO8mZlZBclS9PnL8E7wZmZmlSifk+w8Bm9mZlZAbsGbmZllyGED3gnezMwsUw4zvBO8mZlZRcrlJDuPwZuZmRWQW/BmZmYZ8jiL3gnezMysApHLIXgneDMzs0w5zPAegzczMysgt+DNzMwy5HEWvRO8mZlZBk+yMzMzK6Ac5ncneDMzs4p8sxkzMzNrL9yCNzMzy5S/JrwTvJmZWQUin130TvBmZmYZcpjfPQZvZmZWb5IGShotaYykMxs5foqkVyS9IOn/JK2VVacTvJmZWQapZY/KdasjcDmwB7AxcLCkjcuKPQsMiIjNgduB87NidoI3MzPLoBb+l2FbYExEjI2IOcAwYFBpgYh4JCI+STefAvpmVeoEb2ZmlkUtfEAvSSNLHseW1N4HGF+yPSHd15SjgH9khexJdmZmZrU3NSIGtLQSSd8BBgA7Z5V1gjczM8tQ41n0E4E1Srb7pvsWjUHaFfgpsHNEfJpVqRO8mZl
ZBdVMlGuhEUB/SWuTJPaDgEMWjUFbAVcBAyNicjWVOsGbmZllqOXtYiNinqQTgQeAjsB1EfGypHOAkRExHPgd0B24Tcm3jf9FxD6V6nWCNzMzq7OIuA+4r2zf4JLnuza3Tid4MzOzLDlcys4J3szMLEMO87sTvJmZWRbfbMbMzKxwqlqNrt3xSnZmZmYF5Ba8mZlZBXm9H7xb8GZmZgXkFryZmVkGt+DNzMysXXAL3szMLEMeZ9E7wZuZmVVS+5vN1IQTvJmZWQUinyvZeQzezMysgNyCNzMzy5LDJrwTvJmZWQZPsjMzMyugPE6y8xi8mZlZAbkFb2ZmliGHDXgneDMzs0w5zPBO8GZmZhk8yc7MzKxg8nq7WEVEvWNY6kmaArxd7zgKpBcwtd5BmDXBv5+tZ62IWLnWJ5F0P8m/W0tMjYiBrRFPtZzgrXAkjYyIAfWOw6wx/v20tuLL5MzMzArICd7MzKyAnOCtiK6udwBmFfj309qEx+DNzMwKyC14MzOzAnKCNzMzKyAneLMmSMnSFg0/zczyxAnerBGSluWz/z961DMWM7Ml4aVqzcpI6gIcAEyWtCmwnaT9gAXhWanWBiQtGxEfp8+Xj4jp9Y7J8scJ3qxMRMyRNAq4B5gLfD0i5tc5LFtKSOoGDJI0FVgB6Cvp0oiYU+fQLGec4M1SktTQQo+I5yX9FdgJ2EzSpNJWVGlZs1Y2F3gTuAHoDGyWfunsEBEL6hua5YnH4M1YNGFL2lFSH+As4CjgeGD/9Ng+ktZ0crfW1jCZMyLmAR8CH5Mk+t3S/U7u1ixe6MaWemXJ/RjgZ8BLwAjgj8DqwEXA68B+wHYR8UadwrUCKvsdXBV4LyJC0leAU4DbI2KopG2AyRExvp7xWj44wZulJO0PfAU4DdgY+DqwDHABSVfplsCrEfFW3YK0wilL7qeTfInsBPwqIu6S9A2SXqT3gVWBgyPi3boFbLnhMXgzFk5sOhJYL529PEJSB2AvYDBwaUTcV88YrZhKkvuXgF2AQcAA4OY0998laTxwGHCOk7tVyy14Wyo1NklO0srAXcAbEXFEum8HYGfg6oiY0vaR2tJA0ueBIcC7EXFUum9Pkol2P4yIm+oYnuWUE7wtdcq6RI8GVgI+jojLJPUG/kQyznl0WqZrRMyuX8RWNOVfMNPeolOB7YFLgKciYrakQcClwKbATE+0s+ZwgrellqQfAAcCJwNPAL+LiJ+mLfnbgZci4gRfEmetqewL5n7AfOCt9NLMM4G1gWEkSX5W6aI3Zs3hBG9LpZKW+pHAwSTjnr2AJyPi+DTJd/VsZasVSacA+wL/BL4I3BwRN0o6DdgauDIi/u0vmLakfB28LXXSyUzLAIcAmwMHRMSuJJOYvifp1IiY4uRurSnthm94/gVgp4jYiWSycw9goKTDIuIC4GmSyzJxcrcl5QRvSxVJKwFHAP0jYma6e5KkTsA6wHUkE+3MWlXD+LmkdSLiaeAkSQeRrJa4FzABOE3SdyPiEs+Wt5ZygrelSkS8D7wI/FbSCsA4YBpwJ3A+cF5EjK1fhFZUSqwOPC3pixExjuS69qER8SHwLnAv8FAdw7QC8Ri8LRXSLtHVIuJv6fbvgOfTMc/+wMoklyg5uVuraRg/L5tYdxIwJyKukHQoyVyQS0juYLhrRIypY8hWIG7BWyE1rOudPu8EDAT2l/Q3SesCH5CsVkdEvBER/3Fyt9ZWMn6+c8nuF4D90tnxN5CsXDcG+JqTu7Umt+CtcMpaS7sAM0iW+RxPsuzsp8CGJDPnD4mIYXUK1QqqtOUOdAEeI5k0dz9wC8nqiCsDJ/hWxFYrXqrWCqPhj2rZut57kdyRqw9wSkScIqkfyVKgfUhuKGPWasoua1sfeAf4ArA3yYS600hWqBtAssjS5HrEacXnBG9FsgwwW1JHku737SNiF0k/B1YB3pDUMZ3cNE7S3yNiVh3jtQIq+YL5A+BbwGvAyhHxTeCedOb87sA2JDc
xMqsJj8Fb7qWzk9cF3k4vQZoPfAKMl3Q5SevpmxExF9hD0nLpS738rLWa9IZFDc/3JFnEZm+S4aHl0i+epENCJ5Hc2GhiPWK1pYMTvOVe2iv/JnAt8KiktdPtZUkWsjk2IuZIOgo4B+ja8Lq6BW2FImkD4GRJ66W7pgOXAUeRdMXvFRHzJe0GEBEzImJGfaK1pYUn2VmupZOYVLKIyC+AY/hsjP17JKuE/Q/Yg2TVupfrFK4VlKSvkHTHjyOZRLci8DAwOiK+mJY5HPga8D0nd2sLTvCWW2Wz5ddoWFpW0lnA90nW854L7AZ0Ax5PW/ZmraLsd/DLwD7AFOAPJJdmXkxyM6P1SK5zPzQiXqpTuLaUcYK33EsXDtmRpFv04Yi4KW3JHwZ81de3Wy00dhMYSVuQ9CBNAq4AdgC+mh6+KiJea9sobWnmBG+5JmlfkglLXwUeB56LiOPTY78FvkFyL+35vpe21YKkE0nuY9ADaLhi41CSpWevjwhfBmd14QRvuSKpJ9AxIj5It48CZpFMqPsW8PV0Qt0aETFeUq+ImFrHkK1Aylvtko4nWYnuWOAO4N8R8SNJO5H0IL1MMtlunid1WlvzdfCWG5L2JlkkZCVJV0bE5SSTms4HZkTEl9NypwDrSvoRySVKZq2lC8lKiA1WIRlbP4KkW/50SZ1JVq77GJiQXp5p1uac4C0XJA0EfkXSUuoBXCdpLMlKdM8Db0nai2T28ndIJjPNq1e8VjySdgeOl/Qc8FJE3AGsTrL87BhgUETMS7vs50TE1XUM18zXwVv7J6k3yUzkRyPi6Yh4iM9mJn9Asq73+8BBJJchHeZL4aw1pV8wzwX+SfJ3cw9JKwIXAqsBz6bJ/XCSKzgerVOoZgt5DN5yIf3DuTHJdcXXSrqJ5DK47sClJC2miyV1dpeotaY0kU8laaHfI6kvSW/SNRHxuKRNgKEk4+39gWMi4pW6BWyWcoK3dq3sOuPDgK2ATYAFwCHAlsC2JDfxOAaY5MlM1trS4Z/zge0iYoakvwM9gVHAf4EnSed7RMT0ugVqVsIJ3tq9siR/IMkY+70RcVVjZcxqQdIeJAvY3E8yPHQ10Jvki+VzwMkR8VH9IjRblBO8tTvpTTvmpmOaXSNidlmSPxTYAphI0k06s57x2tJD0q7Ag8BqEfFeuq8DsKIvx7T2xrPorV2R1J1k0ZoJ6R/TjpJ+GxELSu73foOkZUhaUf4dtjYTEf9Mu+sfkbRLRExOF1Bycrd2x38crV2JiJmSegDXk/x+fqthBbqIiJIk/ydJPXzTDmtrEfEPSV2A+yUN8AqJ1l65i97ahbIu+JVIEvxc4Pck1xxPb6ysWb1I6u7hIWvPfB281V1Zcl8f6EyyOthfSW73ukN6bKu01e7kbnXn5G7tnVvw1m5I+j5wFDAaWAHYl2Sd792AeenP7SLinboFaWaWEx6Dt7qRtFzDZUWSdiRZhnZfkjW9f0+ynvd26faGwEVO7mZm1XEXvdWFpHWBsyV9Pt01HXgyIsaRXCJ3AjAW+EZEPBoRV0bEq3UK18wsd5zgrV56kqxG9w1JW5KsAra7pL1LxtgnkdwG1szMmslj8NamJC3fMCM+XcP7IKAbcAHJde13kdzAoyPJ+PtBEfF6ncI1M8stt+CtzaQL1/xX0iVp1/wHwOXATOBHJLfc3I2kZb8c8G0ndzOzJeMWvLWZtCv+KWAOcBZJUj+PZALdFJJ1vS+OiPF1C9LMrCA8i97aTEQ8J2lr4F/ADGB34MvANiRj8lsCHSSdQTLRzt8+zcyWkFvw1ubS7vl/Aj+KiKGSOpLcPGZ34G7PljczazkneKuLNMk/CPw0Iq6odzxmZkXjLnqri4gYkU66GyFpdkRcV++YzMyKxC14qytJWwGfRMToesdiZlYkTvBmZmYF5OvgzczMCsgJ3szMrICc4M3MzArICd7MzKyAnODNzMwKyAnerBVIGiIpSh6TJN2R3ve+VufcOz1Xv3S7X7q
9dzPqOEDS4a0YU/c0hibrXJI409cNlTSyxUEmdT0q6fbWqMusvfJCN2at50NgYPp8HeBc4P8kbRIRH7fB+d8BtgNea8ZrDgB6AUNrEZCZ1Y8TvFnrmRcRT6XPn5L0P+AxYE/gtvLCkrpFxKzWOnlEfEpytz4zM3fRm9XQM+nPfgCSxkm6UNLZkiaQ3FEPSR0knSlpjKRPJb0u6bulFSkxRNJkSR9J+gvQo6xMo13fko6R9KKk2ZLek3S7pJ6ShgL7ATuXDC0MKXndIEkj09e9K+l8SZ3L6t4vjXeWpH+T3Pq32SQdJulxSR9ImibpEUkDmii7r6TX0rgel7Rx2fHMz9NsaeAWvFnt9Et/vluy7xDgZeD7fPb/36XAd4FzgFHAbsB1kt6PiHvTMj8EBgO/JukV+CZwflYAkn6W1nsFcDrwOWAvoDvJEMKawPJpPAAT0tcdAPwVuAo4C1gX+A1Jo+C0tMzWwC3AXcCPgE2BW7NiakI/4C/Am0AX4GDgsXR4Y2xJubWAi4CzgVnAL4AHJPWPiNlpmWo+T7Piiwg//PCjhQ9gCDCVJGl3AtYHHiFppa+WlhlHMk7eteR16wELgO+W1fcXYET6vCMwCfhjWZmHgAD6pdv90u290+3lgU+AiyrEfTvwaNk+AW8D15ftP5Ikqa6Ubt8KvEK65HW676dpDIdXOOcicTZyvEP6Gb4GDC7ZPzR93fYl+9YC5gHHVft5ptuPArfX+/fGDz9q+XAXvVnrWQmYmz5Gk0y0OzAi3ikp83/xWUsT4KskCekuSZ0aHsD/AVtK6gisAawG3F12vjsz4tkO6AZc38z3sT5Jy/7WspgeBrqStNQBtgWGR0TpDS2yYmqUpI0k3SXpPWA+yWe4QRpLqckR8Z+GjYh4m2QoZNt0VzWfp9lSwV30Zq3nQ2BXklbmu8CksuQH8F7Zdi+SFvqHTdS5GrBq+nxy2bHy7XIrpT/fqVhqcb3Sn/c1cXyN9OeqSxDTYiQtBzxI8tmcQtJ7MBv4E8kXiqz6J5N8TlDd5zmhuTGa5ZETvFnrmRcRWddplyf8D0i6mL9E0vIsN5nP/j/tXXasfLvc++nP1UiGD6r1QfrzWODZRo6/lf58dwliasx2QF9gt4hYeImfpJ6NlG2s/t4k8xqgus/TbKngBG9WXw+TtDh7RsRDjRWQNJ4kmQ4C7i859M2Mup8kGTP/LunEuEbMYfFW8mhgIsnY/jUV6h8B7CPpJyU9FVkxNaZb+vPThh2SticZq3+mrGxvSds3dNNLWhPYms+GITI/T7OlhRO8WR1FxGhJVwLDJJ0PjCRJuJsA60fE0RExPz12gaSpJLPo9wM2yqh7uqRzgV9J6kLS5b4MySz6X0TERJKJbIMk7UvSdT0pIiZJOhW4QVIP4B8kXwTWAfYFvhURnwDnAU+TjNVfSzI2f9QSfAxPATOBa9L32Zdk0uLERspOBW5Mrw5omEU/mXShnmo+zyWIzyyXPMnOrP5OILlk7TCSJDyUJAn/u6TMxSSXyB0H3EFymduPsyqOiN8Ax5PMDbib5LK35YGP0iJXkIx/X0fSIj82fd0tJD0GW5Is0nMnyaV0o0iSPelwxEHAVsDfSJL/gc198xHxHrA/yZj+3cBJ6fsc00jxt0l6I4YAw9L38bWyiYvVfJ5mhafF5wCZmZlZ3rkFb2ZmVkBO8GZmZgXkBG9mZlZATvBmZmYF5ARvZmZWQE7wZmZmBeQEb2ZmVkBO8GZmZgX0/5b60JVQ3WidAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "cell_type": "code", + "source": [], + "metadata": { + "id": "JAumX0BC2lFK" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fOKz8xQr5xXJ" + }, + "source": [ + "### Section 2: Text Pre-processing" ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "# Step 4: Evaluate the classifier using various measures\n", - "\n", - "# Function to plot confusion matrix. \n", - "# Ref:http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", - "import itertools\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "def plot_confusion_matrix(cm, classes,\n", - " normalize=False,\n", - " title='Confusion matrix',\n", - " cmap=plt.cm.Blues):\n", - " \"\"\"\n", - " This function prints and plots the confusion matrix.\n", - " Normalization can be applied by setting `normalize=True`.\n", - " \"\"\"\n", - " if normalize:\n", - " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", - "\n", - " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", - " plt.title(title)\n", - " plt.colorbar()\n", - " tick_marks = np.arange(len(classes))\n", - " plt.xticks(tick_marks, classes, rotation=45)\n", - " plt.yticks(tick_marks, classes)\n", - "\n", - " fmt = '.2f' if normalize else 'd'\n", - " thresh = cm.max() / 2.\n", - " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", - " plt.text(j, i, format(cm[i, j], fmt),\n", - " horizontalalignment=\"center\",\n", - " color=\"white\" if cm[i, j] > thresh else \"black\")\n", - "\n", - " plt.tight_layout()\n", - " plt.ylabel('True label',fontsize=15)\n", - " plt.xlabel('Predicted label',fontsize=15)\n", - " \n", - " \n", - "# Print accuracy:\n", - "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", - "\n", - " \n", - "# print the confusion matrix\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - 
"plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with all features')\n", - "\n", - "# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:\n", - "# Further information: https://en.wikipedia.org/wiki/Receiver_operating_characteristic\n", - "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", - "print(\"ROC_AOC_Score: \", roc_auc_score(y_test, y_pred_prob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ga5-KhYN5xaD" - }, - "source": [ - "At this point, we can notice that the classifier is doing poorly with identifying relevant articles, while it is doing well with non-relevant ones. Our large feature vector could be creating a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us change the count vectorizer to take a certain number of features as maximum. " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 511 }, - "id": "ylOI4OsD5xaE", - "outputId": "0aea4279-84d2-49d3-e979-30e7c911f814" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 7.05 ms, sys: 7 µs, total: 7.06 ms\n", - "Wall time: 7.13 ms\n", - "Accuracy: 0.6876876876876877\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "yhC5TZuL5xXK" + }, + "source": [ + "Typical steps involve tokenization, lower casing, removing, stop words, punctuation markers etc, and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK) or sometimes, custom stopword lists are needed depending on the task." 
+ ] }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "7MZSHdHZ5xXL" + }, + "outputs": [], + "source": [ + "stopwords = _stop_words.ENGLISH_STOP_WORDS\n", + "def clean(doc): # doc is a string of text\n", + " doc = doc.replace(\"
\", \" \") # This text contains a lot of
tags.\n", + " doc = \"\".join([char for char in doc if char not in string.punctuation and not char.isdigit()])\n", + " doc = \" \".join([token for token in doc.split() if token not in stopwords])\n", + " # remove punctuation and numbers\n", + " return doc" ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "vect = CountVectorizer(preprocessor=clean, max_features=5000) # Step-1\n", - "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", - "X_test_dtm = vect.transform(X_test)\n", - "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", - "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", - "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm\n", - "print(\"Accuracy: \", metrics.accuracy_score(y_test, y_pred_class))\n", - "# print the confusion matrix\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with max 5000 features')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2JzJ6k7g5xaL" - }, - "source": [ - "Clearly, the performance on relevance classification got better even though the overall accuracy fell by 10%. Let us try another classification algorithm and see if the performance changes. For this experiment, we have considered logistic regression, with class_weight attribute as \"balanced\", to address the problem of class imbalance in this dataset. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 }, - "id": "0v7pM9hB5xbA", - "outputId": "292bdf0c-924b-494b-ffae-c4914f2f5db9" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.7377377377377378\n", - "AUC: 0.7251117679464362\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "3CfVm42o5xXS" + }, + "source": [ + "### Section 3: Modeling\n", + "\n", + "Now we are ready for the modelling. We are going to use algorithms from sklearn package. We will go through the following steps:\n", + "\n", + "1 Split the data into training and test sets (75% train, 25% test) \n", + "2 Extract features from the training data using CountVectorizer, which is a bag of words feature implementation. We will use the pre-processing function above in conjunction with Count Vectorizer \n", + "3 Transform the test data into the same feature vector as the training data. \n", + "4 Train the classifier \n", + "5 Evaluate the classifier " + ] }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GimJJHhg5xYl", + "outputId": "48f5d9f9-b0e3-4e65-b6cc-13cc21f874a1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7991,) (7991,)\n", + "(5993,) (5993,)\n", + "(1998,) (1998,)\n" + ] + } + ], + "source": [ + "import sklearn\n", + "#from sklearn.cross_validation import train_test_split\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Step 1: train-test split\n", + "X = our_data.text # the column text contains textual data to extract features from\n", + "y = our_data.relevance # this is the column we are learning to predict.\n", + "print(X.shape, y.shape)\n", + "# split X and y into training and testing sets. By default, it splits 75% training and 25% test\n", + "# random_state=1 for reproducibility\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n", + "print(X_train.shape, y_train.shape)\n", + "print(X_test.shape, y_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gsUyIBUD5xZI", + "outputId": "6e17b2c2-d0ea-453a-e42e-308f33ed5bd2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(5993, 49753) (1998, 49753)\n" + ] + } + ], + "source": [ + "# Step 2-3: Preprocess and Vectorize train and test data\n", + "vect = CountVectorizer(preprocessor=clean) # instantiate a vectoriezer\n", + "X_train_dtm = vect.fit_transform(X_train)# use it to extract features from training data\n", + "# transform testing data (using training data's features)\n", + "X_test_dtm = vect.transform(X_test)\n", + "print(X_train_dtm.shape, X_test_dtm.shape)\n", + "# i.e., the dimension of our feature vector is 49753!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nDLwA4CL5xZq", + "outputId": "c374e0f2-2026-497d-b2c8-12ad9289b865" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 13.6 ms, sys: 0 ns, total: 13.6 ms\n", + "Wall time: 81.6 ms\n" + ] + } + ], + "source": [ + "# Step 3: Train the classifier and predict for test data\n", + "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", + "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", + "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "LiCHjvc75xZ3", + "outputId": "db90135e-8645-4e2a-f2d3-5d3f3c2ecaa4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.7822822822822822\n", + "ROC_AOC_Score: 0.7251117679464362\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "# Step 4: Evaluate the classifier using various measures\n", + "\n", + "# Function to plot confusion matrix.\n", + "# Ref:http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + "import itertools\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "def plot_confusion_matrix(cm, classes,\n", + " normalize=False,\n", + " title='Confusion matrix',\n", + " cmap=plt.cm.Blues):\n", + " \"\"\"\n", + " This function prints and plots the confusion matrix.\n", + " Normalization can be applied by setting `normalize=True`.\n", + " \"\"\"\n", + " if normalize:\n", + " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + " tick_marks = np.arange(len(classes))\n", + " plt.xticks(tick_marks, classes, rotation=45)\n", + " plt.yticks(tick_marks, classes)\n", + "\n", + " fmt = '.2f' if normalize else 'd'\n", + " thresh = cm.max() / 2.\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " plt.text(j, i, format(cm[i, j], fmt),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + "\n", + " plt.tight_layout()\n", + " plt.ylabel('True label',fontsize=15)\n", + " plt.xlabel('Predicted label',fontsize=15)\n", + "\n", + "\n", + "# Print accuracy:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "\n", + "\n", + "# print the confusion matrix\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with all features')\n", + "\n", + "# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:\n", + "# Further information: 
https://en.wikipedia.org/wiki/Receiver_operating_characteristic\n", + "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", + "print(\"ROC_AOC_Score: \", roc_auc_score(y_test, y_pred_prob))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ga5-KhYN5xaD" + }, + "source": [ + "At this point, we can notice that the classifier is doing poorly with identifying relevant articles, while it is doing well with non-relevant ones. Our large feature vector could be creating a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us change the count vectorizer to take a certain number of features as maximum." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 686 + }, + "id": "ylOI4OsD5xaE", + "outputId": "cb0303c1-140f-4990-aa1c-e9ba1e3cc05b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 7.24 ms, sys: 0 ns, total: 7.24 ms\n", + "Wall time: 35.2 ms\n", + "Accuracy: 0.6876876876876877\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "vect = CountVectorizer(preprocessor=clean, max_features=5000) # Step-1\n", + "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", + "X_test_dtm = vect.transform(X_test)\n", + "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", + "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", + "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm\n", + "print(\"Accuracy: \", metrics.accuracy_score(y_test, y_pred_class))\n", + "# print the confusion matrix\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with max 5000 features')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2JzJ6k7g5xaL" + }, + "source": [ + "Clearly, the performance on relevance classification got better even though the overall accuracy fell by 10%. Let us try another classification algorithm and see if the performance changes. For this experiment, we have considered logistic regression, with class_weight attribute as \"balanced\", to address the problem of class imbalance in this dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "0v7pM9hB5xbA", + "outputId": "e86c81ce-cb3f-4268-8ccd-7daa7c4d3a66" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.7367367367367368\n", + "AUC: 0.6584385682402464\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression # import\n", + "\n", + "logreg = LogisticRegression(class_weight=\"balanced\") # instantiate a logistic regression model\n", + "logreg.fit(X_train_dtm, y_train) # fit the model with training data\n", + "\n", + "# Make predictions on test data\n", + "y_pred_class = logreg.predict(X_test_dtm)\n", + "y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]\n", + "\n", + "# calculate evaluation measures:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with normalization')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6v1evQyy5xbe" + }, + "source": [ + "Let us wrap this notebook by trying with one more classifier, but reducing the feature vector size to 1000." 
] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "from sklearn.linear_model import LogisticRegression # import\n", - "\n", - "logreg = LogisticRegression(class_weight=\"balanced\") # instantiate a logistic regression model\n", - "logreg.fit(X_train_dtm, y_train) # fit the model with training data\n", - "\n", - "# Make predictions on test data\n", - "y_pred_class = logreg.predict(X_test_dtm)\n", - "\n", - "# calculate evaluation measures:\n", - "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", - "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with normalization')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6v1evQyy5xbe" - }, - "source": [ - "Let us wrap this notebook by trying with one more classifier, but reducing the feature vector size to 1000." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 }, - "id": "XJLKusAQ5xbf", - "outputId": "4dcdc0d5-4f4f-487a-ac44-2bc6778a0876" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.6836836836836837\n", - "AUC: 0.7251117679464362\n" - ] + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "XJLKusAQ5xbf", + "outputId": "cea4494e-a06d-41d0-c2a9-cdad88ff4a89" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.6926926926926927\n", + "AUC: 0.6742856032997147\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "from sklearn.svm import LinearSVC\n", + "\n", + "vect = CountVectorizer(preprocessor=clean, max_features=1000) # Step-1\n", + "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", + "X_test_dtm = vect.transform(X_test)\n", + "\n", + "classifier = LinearSVC(class_weight='balanced') # instantiate a Linear Support Vector Machine model\n", + "classifier.fit(X_train_dtm, y_train) # fit the model with training data\n", + "\n", + "# Make predictions on test data\n", + "y_pred_class = classifier.predict(X_test_dtm)\n", + "\n", + "# Like other Sklearn models, LinearSVC doesn't have implement .predict_proba, but we can get the same results\n", + "# by using .decision_function (predicts the confidence scores) and then applying softmax on the output\n", + "\n", + "# Softmax Function\n", + "def softmax(x):\n", + " e_x = np.exp(x - np.max(x))\n", + " return e_x / e_x.sum(axis=0)\n", + "\n", + "y_prob_intermediate = classifier.decision_function(X_test_dtm) ## Predicts the Confidence Scores\n", + "y_pred_prob = softmax(y_prob_intermediate)\n", + "\n", + "# calculate evaluation measures:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with normalization')" + ] }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAf8AAAG7CAYAAADNOJzEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nOzdeZxVdf3H8debHVkUATdAQQXNLRc0tXLLBdPEshSt1DJt0V+WaWkLGbZauVRYaZGVC25pmCRphqW5MBhqoiiixoALq+zLwOf3xzmDh8vMvXeYuXNn7n0/e5yH95zzPed87szE53yX8z2KCMzMzKx6dCh3AGZmZta6nPzNzMyqjJO/mZlZlXHyNzMzqzJO/mZmZlXGyd/MzKzKOPlbxZDUXdK9kt6WdEczzvNxSX9rydjKRdL7Jc0o8TWWSdo5z/5XJR1dyhhag6TLJd2Uft4x/d4dW/gaJf99mYGTv5WBpDMk1aT/eL4u6a+S3tcCp/4osC3QNyI+trkniYibI+LYFoinpCSFpF3zlYmIf0XEbqWMIyJ6RsSsNKYbJX23lNdrCyLif+n3Xtec8+T+Dlvj92UGTv7WyiRdBFwDfJ8kUe8IXAeMbIHT7wS8GBF1LXCudk9Sp3LHUE7V/v3N8nHyt1YjaUtgDHB+RPwpIpZHxNqIuDciLknLdJV0jaS56XKNpK7pviMk1Ur6iqS30laDT6X7vgOMBk5LWxTOyTbTpmUGpzWtTun62ZJmSVoq6RVJH89sfyRz3KGSpqTdCVMkHZrZN1nSFZIeTc/zN0n9Gvn+9fF/NRP/yZI+KOlFSQslfT1T/iBJj0lanJb9haQu6b5/psWeTr/vaZnzf03SG8Dv6relx+ySXmP/dH0HSfMkHdFArJ+SdG9m/aVsV4qk2ZL2TT+HpF0lnQd8HPhqGtO9mVPuK+mZ9Gd4m6RujfyMzpb0iKSfSFqU/l6Oz+zfQdKE9HvMlHRuZt/lku6UdJOkJcDZ6e/nu5L+XR+TpL6Sbpa0JP19Ds6c49r0uy2RNFXS+xuJc8PfkqRD0nPXL6skvdqc32HmOu9Kv8NiSc9JOimz70ZJYyXdl/7tPSFpl4biNdtERHjx0ioLMAKoAzrlKTMGeBzYBugP/Bu4It13RHr8GKAz8EFgBdAn3X85cFPmXLnrg4EAOgE9gCXAbum+7YE9089nA4+kn7cGFgGfTI87PV3vm+6fDLwMDAO6p+s/bOS71cc/Oo3/XGAecAvQC9gTWAkMScsfABycXncw8Dzwpcz5Ati1gfP/COiaxnMEUJspcy4wHdgCmAT8pJFYdwYWk1QQdgBeqz9Pum8R0CE3DuBG4Ls553oVeDI9z9bp9/hcI9c9G1ibxtkR+DwwF1C6/58kLUXdgH3Tn99Rmd/3WuDkNO7638dMYBdgy/S7vwgcnf5c/wD8LnP9TwB9031fAd4AuuX+PZH5W8qJvzPwMPCDZvwOazPnmgl8HegCHAUs5Z2/2RuBBcBB6flvBsaX+//nXtrH4pq/taa+wPzI3yz/cWBMRLwVEfOA75Ak3npr0/1rI2IisAzY3D7S9cBekrpHxOsR8VwDZU4AXoqIP0ZEXUTcCrwAfChT5ncR8WJErARuJ0lKjVkLfC8i1gLjgX7AtRGxNL3+dODdABExNSIeT6/7KvBr4PAivtO3I2J1Gs9GIuIGkoTyBMkNzzcaOkkkffhL0+9yGMmNwlxJu6cx/Csi1heIJetnETE3IhYC95L/Z/RaRNwQSX/679M4t5U0CHgv8LWIWBUR04DfAGdmjn0sIu6JiPWZ7/+7iHg5It4G/gq8HBEPpn+HdwD7Zb73TRGxIP2Z/5TkJqopf18/I/m5fSM93+b8DusdDPQkuZlcExEPAX8huQGtd3dEPJl+l5vJ/3M128DJ31rTAqCf8vfF1tcy672WbttwjpybhxUk/0A2SUQsB04DPge8njad7l5EPPUxDcisv9GEeBb
EO4PE6pPTm5n9K+uPlzRM0l8kvZE2Y3+f5GYhn3kRsapAmRuAvYCfR8TqPOUeJqmJHpZ+nkySuA5P15uiKT+jDWUjYkX6sSfJ72JhRCzNlM39Xcxu4Hy5P98Gf94Aki6W9HzaPbGYpLWg0M+8/tjPkvy8zqi/MdrM32G9HYDZOTdZzfnbM9vAyd9a02PAapJm2cbMJRm4V2/HdNvmWE7SvF1vu+zOiJgUEceQ1CxfIEmKheKpj2nOZsbUFL8kiWtoRPQmaf5VgWPyvqZTUk+SAZe/BS6XtHWe4vXJ//3p54cpnPxL+ZrQucDWknpltuX+Ljb7+mn//leBU0m6krYC3qbwz7z+2CuAkRGxJLNrc36H9eYCgyRl/51urb89q3BO/tZq0mbX0cDYdKDbFpI6Szpe0pVpsVuBb0rqr2Tg3GjgpsbOWcA04DAlz2RvCVxWv0PStpJGSupBckOyjKTJPNdEYJiSxxM7SToN2IOk+bXUepGMS1iWtkp8Pmf/myT9701xLVATEZ8B7gN+lafsw8CRQPeIqAX+RTJuoy/wn0aO2ZyYihIRs0nGgPxAUjdJ+wDnsPl/H7l6kYyZmAd0kjQa6F3ooLQ74nbgzIh4sYFzbu7v8AmS2vxX0/+fHEHS3TS+uK9j1jgnf2tVaT/qRcA3Sf6RnQ1cANyTFvkuUAM8AzwLPJVu25xrPQDclp5rKhsn7A5pHHOBhSS12dx/mImIBcCJJIO/FpDUDE+MiPmbE1MTXQycQdKHfAPJd8m6HPh9OhL81EInkzSSJHnXf8+LgP2VPuWQK01ky0iSPmmNdhbwaDT+fPtvgT3SmO5ppExznE4ycG4ucDfJ+IYHW+jck4D7SQYEvgasouFuhFwfIHls9c7MiP/68SOb/TuMiDUkyf54YD7JQMczI+KFzfhuZhupH0FrZmZmVcI1fzMzsyrj5G9mZlZlnPzNzMyqjJO/mZlZlfGLL9oAdeoe6tKrcEGzVrbfu3YsdwhmjXrqqanzI6J/qa/TsfdOEXWbTJjZJLFy3qSIGNFCITWbk38boC696LpbwSe1zFrdo0/8otwhmDWqe2flzr5ZElG3stn/Rq+aNrbYmR1bhZO/mZlZXgJVVi95ZX0bMzMzK8g1fzMzs3wEqNhXMrQPTv5mZmaFVFizv5O/mZlZIa75m5mZVRMP+DMzM7MWJmmEpBmSZkq6tIH9V0uali4vSlqc2XeWpJfS5axirueav5mZWSElbPaX1BEYCxwD1AJTJE2IiOn1ZSLiy5ny/wfsl37eGvg2MBwIYGp67KJ813TN38zMLB+RNPs3Z8nvIGBmRMyKiDXAeGBknvKnA7emn48DHoiIhWnCfwAoOJOga/5mZmZ5qdQD/gYAszPrtcB7GoxE2gkYAjyU59gBhS7o5G9mZlZ6/STVZNavj4jrN+M8o4A7I2Jdc4Jx8jczMyuk+aP950fE8Eb2zQEGZdYHptsaMgo4P+fYI3KOnVwoGPf5m5mZFSI1b8lvCjBU0hBJXUgS/IRNQ9DuQB/gsczmScCxkvpI6gMcm27LyzV/MzOzvEr7nH9E1Em6gCRpdwTGRcRzksYANRFRfyMwChgfEZE5dqGkK0huIADGRMTCQtd08jczMyuziJgITMzZNjpn/fJGjh0HjGvK9Zz8zczM8vGLfczMzKpQhU3v6+RvZmaWl+f2NzMzs3bONX8zM7NCOrjP38zMrHrUz+1fQZz8zczMCvFofzMzs2riAX9mZmbWzrnmb2ZmVoib/c3MzKpMhTX7O/mbmZnlU9yb+dqVyrqVMTMzs4Jc8zczMyvEzf5mZmZVpsKa/Z38zczM8vJz/mZmZtbOueZvZmZWiJv9zczMqohf7GNmZlZt3OdvZmZm7Zxr/mZmZoW4z9/MzKzKVFizv5O/mZlZIa75m5mZVRF5wJ+ZmZm1c675m5mZFeJmfzMzs+oiJ38zM7PqISov+bvP38zMrMo4+ZuZmeWjFlgKXUIaIWmGpJmSLm2
kzKmSpkt6TtItme3rJE1LlwnFfCU3+5uZmeWlkjb7S+oIjAWOAWqBKZImRMT0TJmhwGXAeyNikaRtMqdYGRH7NuWaTv5mZmYFlLjP/yBgZkTMSq81HhgJTM+UORcYGxGLACLireZc0M3+ZmZm5TUAmJ1Zr023ZQ0Dhkl6VNLjkkZk9nWTVJNuP7mYC7rmb2ZmVkAL1Pz7SarJrF8fEdc34fhOwFDgCGAg8E9Je0fEYmCniJgjaWfgIUnPRsTLhU5mZmZmebRA8p8fEcMb2TcHGJRZH5huy6oFnoiItcArkl4kuRmYEhFzACJilqTJwH5A3uTvZn8zM7N8Sj/afwowVNIQSV2AUUDuqP17SGr9SOpH0g0wS1IfSV0z29/LxmMFGuSav5mZWRlFRJ2kC4BJQEdgXEQ8J2kMUBMRE9J9x0qaDqwDLomIBZIOBX4taT1Jhf6H2acEGuPkb2ZmlodK/KgfQERMBCbmbBud+RzARemSLfNvYO+mXs/J38zMrIBKm97Xyd/MzKwAJ38zM7MqU2nJ36P9zczMqoxr/mZmZvkU+XKe9sTJ38zMrIBKa/Z38jczM8ujNR71a23u8zczM6syrvmbmZkVUGk1fyd/MzOzQior9zv5m5mZ5aXKq/m7z9/MzKzKuOZvZmZWQKXV/J38zczMCnDyNzMzqyJ+zt/MzMzaPdf8zczMCqmsir+Tv5mZWV4V+Kifk7+ZmVkBTv5mZmZVptKSvwf8mZmZVRknf2tXjjn0XTx997f475+/zcWfOmaT/Vd+5SM8Pv5SHh9/Kc/cM5rX/3nlhn1//sUXeP2fV3LXtZ9rzZCtSvxt0v3ss+du7Ln7rvz4yh9usv/aq69iv3324MD99uH4Yz/Aa6+9tmHfNy77GgfsuxcH7LsXd9x+W2uGbcVSM5c2xs3+1m506CCuufRUTvj8L5jz5mIeufkS/vLws7ww640NZb760z9t+Pz5UYfz7t0Gbli/+g8PskW3LpxzyvtaNW6rfOvWreNLXzyf+/76AAMGDuR9Bx/IiSeexLv22GNDmX33249HP1vDFltswfW/+iXfuOyr3HTLbfx14n1M+89TPFEzjdWrV3PsB47guBHH07t37zJ+I8vlZn+zMjlwr8G8PHs+r85ZwNq6ddwx6SlOPGKfRsufOuIAbr9/6ob1yU++yNLlq1sjVKsyU558kl122ZUhO+9Mly5d+Nhpo/jLvX/eqMzhRxzJFltsAcBB7zmYObW1ADz//HTe9/7D6NSpEz169GDvvffhb5Pub/XvYI2T1OylrXHyt3Zjh222pPbNRRvW57y5iAH9t2yw7I7b92GnHfoyecqM1grPqtjcuXMYOHDQhvUBAwYyZ86cRsvf+LvfctyI4wHYZ59387dJ97NixQrmz5/Pww//g9ra2SWP2apbqzb7Swrgqoj4Srp+MdAzIi7Pc8zJwIsRMb2BfZcD5wLzgC7AFRFxa4EYlkVEz83+Ek0g6WzgbxExtzWuZ+/42HEHcM/fp7F+fZQ7FLON3HrzTTw1tYYHHnoYgKOPOZapNVM48v2H0q9/f97znkPo2KFjmaO0XG2x9t4crV3zXw18RFK/JhxzMrBHnv1XR8S+wEjg15I6NyfAFnY2sEO5g6gUc996m4Hb9tmwPmDbPsyZ93aDZT963AHcfn9Na4VmVW6HHQZsVFufM6eWAQMGbFLuob8/yI9++D3uvHsCXbt23bD9a5d9gyemTuO++x8gCIYOG9YqcVvx3OzfPHXA9cCXc3dIGizpIUnPSPq7pB0lHQqcBPxY0jRJuzR24oh4CVgB9EnPd4mkKen5vtPQMQ2VkfRDSednylwu6WJJPdO4npL0rKSRmbifl3SDpOck/U1Sd0kfBYYDN6exd9/cH5olap57jV137M9OO/Slc6eOfOy4/blv8jOblBs2eFv69N6Cx59+pQxRWjUafuCBzJz5Eq++8gpr1qzhjtvGc8KJJ21UZtp//sMFX/gsd/5pAttss82
G7evWrWPBggUAPPvMM/z32Wc4+phjWzV+K4JH+zfbWOAZSVfmbP858PuI+L2kTwM/i4iTJU0A/hIRd+Y7qaT9gZci4i1JxwJDgYNIfuwTJB0WEf/MlG+wDHAbcE0aJ8CpwHHAKuDDEbEkbbl4PI2N9DynR8S5km4HTomImyRdAFwcEZtUQSWdB5wHQOdW6YVo99atW8+Xf3Q79153Ph07iN//+XGen/UG3/r8CTw1/X/c9/CzQNLkf8ekqZsc/+Bvv8SwIdvSs3tXZt5/BZ/7zi08+Njzrf01rAJ16tSJq6/9BR864TjWrVvHWWd/mj323JMxl49m/wOGc+KHTuLrl17C8mXL+PiojwEwaMcdufPuCaxdu5ajj3w/AL169WbcjTfRqZMfxLLSUkTr9YnW97dLGgOsBVaS9vlLmg9sHxFr06b71yOin6QbaST5Z/r8FwPDgA9FxP2SfgJ8NN0O0BP4QUT8NhNDvjLPAx8A+gPXRcR705iuBg4D1gO7AUOAbsADETE0jelrQOeI+K6kyTSS/LM6bLFNdN3t1Cb+NM1Kb9GUX5Q7BLNGde+sqRExvNTX6brt0Bjw8WubdY5Xrj6hVWItVrluL68BngJ+1wLnujoifiLpJOC3adeASBL5r/Mcl6/MHSQ3BtuRtAQAfJzkZuCA9AblVZLED8lYhnrrADfxm5lVigp8sU9ZHvWLiIXA7cA5mc3/Bkalnz8O/Cv9vBToVcQ5JwA1wFnAJODTknoCSBogaZucQ/KVuS2N5aMkNwIAWwJvpYn/SGCnIr5qUbGbmVnbJUBq3lLwGtIISTMkzZR0aSNlTpU0PR1fdktm+1mSXkqXs4r5TuXsWPopcEFm/f+A30m6hOTRvU+l28cDN0j6IvDRiHg5zznHALcA70qXx9K7tWXAJ4C36gtGxN8kNVgmIp6T1AuYExGvp4fcDNwr6VmSm4wXiviONwK/krQSOCQiVhZxjJmZVRFJHUnGmR0D1AJTJE3IPuIuaShwGfDeiFhUX1mVtDXwbZIB5gFMTY9dlHudrFZN/tnn6yPiTWCLzPprwFENHPMojTzqlzs/QERMJemLB7g2XfLF0GCZdN/eOevzgUMaKgvslSn3k8znu4C7GjnGzMzahZI/rncQMDMiZgFIGk/y+Hp2fptzgbH1ST0i6iuzx5GMO1uYHvsAMALIO+eNZ/gzMzMroAWa/ftJqsks52VOPwDITutYm27LGgYMk/SopMcljWjCsZvw8yRmZmYFtEDNf34zR/t3Inms/AhgIPBPSXvnPSIP1/zNzMzyaWatv4j7hjnAoMz6wHRbVi0wISLWRsQrwIskNwPFHLsJJ38zM7PymgIMlTREUheSp80m5JS5h6TWTzrR3DBgFsmTa8dK6iOpD3Bsui0vN/ubmZnlIaBDh9IN+IuIunRG2ElAR2Bc+tTZGKAmfZS9PslPJ5lP5pKIWAAg6QqSGwiAMfWD//Jx8jczMyug1HP8RMREYGLOttGZzwFclC65x44DxjXlek7+ZmZmBXiGPzMzM2vXXPM3MzPLp8gpetsTJ38zM7M8krn9Kyv7O/mbmZnlVfLpfVud+/zNzMyqjGv+ZmZmBVRYxd/J38zMrJBKa/Z38jczM8unAkf7u8/fzMysyrjmb2Zmlocf9TMzM6tCFZb7nfzNzMwKcc3fzMysylRY7veAPzMzs2rjmr+ZmVk+crO/mZlZVUlG+5c7ipbl5G9mZpaXX+xjZmZm7Zxr/mZmZgVUWMXfyd/MzKyQSmv2d/I3MzPLxy/2MTMzs/bONX8zM7M8/GIfMzOzKuTkb2ZmVmUqLPe7z9/MzKzauOZvZmZWgJv9zczMqkkFPurn5G9mZpaHPLe/mZlZ9ZGatxQ+v0ZImiFppqRLG9h/tqR5kqaly2cy+9Zltk8o5vu45m9mZlZGkjoCY4FjgFpgiqQJETE9p+htEXFBA6dYGRH7NuWaTv5mZmYFdChts/9BwMyImAUgaTwwEshN/i3
Gzf5mZmYFtECzfz9JNZnlvMzpBwCzM+u16bZcp0h6RtKdkgZltndLz/m4pJOL+T6u+ZuZmeWRJPBm1/znR8TwZhx/L3BrRKyW9Fng98BR6b6dImKOpJ2BhyQ9GxEv5zuZa/5mZmblNQfI1uQHpts2iIgFEbE6Xf0NcEBm35z0v7OAycB+hS7o5G9mZlZABzVvKWAKMFTSEEldgFHARqP2JW2fWT0JeD7d3kdS1/RzP+C9FDFWwM3+ZmZmBZTyOf+IqJN0ATAJ6AiMi4jnJI0BaiJiAvBFSScBdcBC4Oz08HcBv5a0nqRC/8MGnhLYRKPJX9I8IJoQ/DbFljUzM2tPSj3HT0RMBCbmbBud+XwZcFkDx/0b2Lup18tX8x9LE5K/mZmZtQ+NJv+IuLwV4zAzM2uTRDLFbyVpUp+/pD7AXiSjEv8aEYskdQPWRMT6UgRoZmZWbkUM2mtXikr+kjoB3wfOB7qTdAccCCwC7gJqgG+XKEYzM7PyUfW+2Od7wLnABcDOsFH7x5+BD7VwXGZmZlYixTb7nwlcGhG/S19AkPUyyQ2BmZlZRaqwin/RyX8rkiTfkC4kzyWamZlVHFHyF/u0umKb/f9L8oahhhwPPNUy4ZiZmbU9LfBinzal2Jr/d4G7JHUH7iAZ8LevpA8DnyWZatDMzKwiVeWAv4j4M3AGcDTwV5JWkN+QTC/4yYiYVKoAzczMrGUV/Zx/RNwO3C5pN6AvydzCMyLCswCamVnFaqtN983R5Bf7RMSMUgRiZmbWVlXrgD8k7S3pFkkzJS1P/3uLpH1KGaCZmVm5qZlLW1PsDH8nA7eTPO53J/AWsA3JEwA1kk6NiHtKFqWZmZm1mGKb/X9EMpPfqdk+fkmXkYz+/xHg5G9mZhWpKkf7k7zI5ze5g/vS9RvS/WZmZhUnmeSneUtbU2zyrwH2bGTfXniSHzMzq1Tpi32as7Q1jTb7S9ois3oRMF5SZ5Lm/fo+/w8DnwFGlTJIMzMzazn5+vyXkczkV0/AD0he7ZvdBvAEnt/fzMwqVBusvDdLvuT/aTZO/mZmZlWpLTbdN0ejyT8ibmzFOMzMzNqk+gF/laToSX7MzMysMhQ9va+k04BzgWFAt9z9EbFNC8ZlZmbWZlRas39RNX9JZwC/B2YCA4EJwF/S45cAvyhVgGZmZuVWadP7FtvsfwlwBXB+un5dRHwaGALMB1aUIDYzM7Oyk5IX+zRnaWuKTf5DgUcjYh2wDugNEBFLSab2vaA04ZmZmZVf/Wt9N3dpa4pN/kuArunnOcC7MvsE9G3JoMzMzKx0ih3wNwXYB5hE0t8/WlIdsAYYDTxemvDMzMzKr9IG/BWb/H8A7JR+Hp1+/iVJy8EU4LMtH5qZmVnbUGG5v7jkHxGPk9buI2IxMFJSV6BrRCwpYXxmZmZlJdrmoL3m2OxJfiJitRO/mZlZ80kaIWmGpJmSLm1g/9mS5kmali6fyew7S9JL6XJWMdfL91a/K5sQd0TE15pQ3szMrH0o8Yh9SR2BscAxQC0wRdKEiJieU/S2iLgg59itgW8Dw0nexzM1PXZRvmvma/b/WBNiD8DJfzMNGLQtF15zUbnDMNvE6TfWlDsEszahxAP+DgJmRsSs9FrjgZFAbvJvyHHAAxGxMD32AWAEcGu+g/K92GdIkUGbmZlVtBZ4EU4/Sdm76esj4vr08wBgdmZfLfCeBs5xiqTDgBeBL0fE7EaOHVAomKLn9jczM7PNNj8ihjfj+HuBWyNitaTPkky5f9Tmnsxv9TMzM8tDJM3+zVkKmAMMyqwPTLdtEBELImJ1uvob4IBij22Ik7+ZmVkBHdS8pYApwFBJQyR1AUaRTKi3gaTtM6snAc+nnycBx0rqI6kPcGy6LS83+5uZmRVQRALfbBFRJ+kCkqTdERgXEc9JGgPURMQE4IuSTgLqgIXA2emxCyVdQXIDATCmfvBfPk7+ZmZmZRYRE4GJOdtGZz5fBlzWyLHjgHFNuV6Tkr+
SjouBJP0LT0fE8qYcb2Zm1t4kb+ar0hn+JH2BZBDBa8C/gN3S7X+S9KXShGdmZlZ+Je7zb3VFJX9JlwBXATeQPFqQ/SqTgdNaPDIzM7M2Qmre0tYU2+x/PjA6Iq5MpyHMmgEMa9mwzMzMrFSKTf7bAVMb2bce6NYy4ZiZmbUtgqp9q99M4PBG9h1GcfMPm5mZtUsdmrm0NcXW/K8BrpO0Brgz3baNpHOAi4BzSxGcmZlZW1BhFf/ikn9E/CadOWg08J1080RgBXB5RNxSovjMzMzKSlLFNfsX/Zx/RPxY0q+AQ4G+JDMMPRYRb5cqODMzM2t5TZrkJyKWUsScwWZmZpWkwir+xSX/dIKfvCLiuuaHY2Zm1va0xYl6mqPYmv8v8uyL9L9O/mZmVnGq9lG/iOiQuwBbA6cDTwN7lDJIMzMzazmb/Va/iFgM3CZpS+DXwBEtFZSZmVlbUmEV/xZ5pe8rwPAWOI+ZmVnb00ZfztMczUr+krYHvkJyA2BmZlaRRGVl/2JH+8/jnYF99boAvYBVwEdaOC4zMzMrkeaM9l8F1AL3R8SClgvJzMys7UhG+5c7ipZVMPlL6gw8CLwSEXNLH5KZmVnbUmnJv5hH/dYBDwG7lzgWMzOzNklSs5a2pmDyj4j1wEvAdqUPx8zMzEqt2D7/bwA/kvRsRDxbyoDMzMzakqrq85d0GPBURCwDvknyJr9pkuYAb5Iz+j8iDiploGZmZmWh6prk5x/AIcCTwH/TxczMrOpU2tz++ZL/hm8aEZ9qhVjMzMzanEps9i/qxT5mZmZWOQoN+PugpKIe8YuIP7RAPGZmZm1OhbX6F0z+o4s8TwBO/mZmVoFEhyqb2/9IoKY1AjEzM2uLRPXV/FdGxPJWicTMzMxahQf8mZmZ5aNktH9zloKXkEZImiFppqRL85Q7RVJIGp6uD5a0UtK0dPlVMV+p2Bn+zMzMqlYpn/OX1BEYCxxD8rbcKZImRMT0nHK9gAuBJ3JO8XJE7NuUazZa84+IDhHxZFNOZmZmVmnq+/ybsxRwEDAzImZFxBpgPDCygXJXAD8CVjX3O7nZ38zMrPT6SarJLOdl9g0AZmfWa9NtG0jaHxgUEfc1cO4hkhQudgEAAB5ESURBVP4j6WFJ7y8mGDf7m5mZFdACzf7zI2L45hwoqQNwFXB2A7tfB3aMiAWSDgDukbRnRCzJd07X/M3MzAoocbP/HGBQZn1guq1eL2AvYLKkV4GDgQmShkfE6ohYABARU4GXgWGFLuiav5mZWR6i5DXlKcBQSUNIkv4o4Iz6nRHxNtBvQzzSZODiiKiR1B9YGBHrJO0MDAVmFbqgk7+ZmVkZRUSdpAuASUBHYFxEPCdpDFATERPyHH4YMEbSWmA98LmIWFjomk7+ZmZm+QhU4in+ImIiMDFnW4NT7EfEEZnPdwF3NfV6Tv5mZmYFVNjsvk7+ZmZm+YjSTvJTDk7+ZmZmBVRW6vejfmZmZlXHNX8zM7MCKqzV38nfzMwsP5V8tH9rc/I3MzPLoxUm+Wl1lfZ9zMzMrADX/M3MzApws7+ZmVmVqazU7+RvZmaWXytM79va3OdvZmZWZVzzNzMzy6MSR/s7+ZuZmRVQac3+Tv5mZmYFVFbqr7yWDDMzMyvANX8zM7MCKqzV38nfzMwsn2TAX2Vlfyd/MzOzAlzzNzMzqypCFVbz94A/MzOzKuOav5mZWQFu9jczM6siHvBnZmZWbVR5NX/3+ZuZmVUZ1/zNzMwKqLSav5O/mZlZAZX2qJ+Tv5mZWR4COlRW7nefv5mZWbVxzd/MzKyASmv2d83fzMysAKl5S+Hza4SkGZJmSro0T7lTJIWk4Zltl6XHzZB0XDHfxzV/a1cG9+nOUbv2RRLPvr6EJ2e/3WC5of16MHLPbfnj1FreXLYGgH49unDssH506diBCLjpqTmsi2jN8K2C7TewN+ccvCMdBA/OmM+fnnljo/1HDu3LWQcNZOGKtQBMnP4WD86Yv2F
/984d+NlH9+LJVxdzw2P/a9XYrbBS1vwldQTGAscAtcAUSRMiYnpOuV7AhcATmW17AKOAPYEdgAclDYuIdfmu6eRv7YaAo4f2445nXmfp6jo+sf8AXl6wggXpP6b1OncU+w/ozdwlqzY69oTd+zPxhXnMW76Gbp06sN6J31pIB8F5h+7I5X99kQXL13LlyHfx5P8WU7t41UblHp21qNHEfsYBA5j++tLWCNfanoOAmRExC0DSeGAkMD2n3BXAj4BLMttGAuMjYjXwiqSZ6fkey3dBN/tbu7Fd764sWrmWt1fVsT7ghbeWs0vfHpuUe9/grZkyezHr1r+T3Adv3Z15y9cwb3nSCrCqbj1O/dZShvbvwetLVvPm0jXUrQ8embWQg3baqujjd+67BVt278y0OUtKGKVtrvrR/s1ZgH6SajLLeZlLDABmZ9Zr023vxCDtDwyKiPtywit4bENc87d2o1eXTixdXbdhfdnqOrbv3XWjMtv07EKvrp2YtXAlBw565x/fPt07E8Ape2/HFp078sK8ZUxppMvArKm23qIL89MbS4AFy9cwrH/PTcodPGQr9ti+J3PfXsW4x2ezYPlaBHzq4EFc849Z7DOgdytGbcVrkVf6zo+I4YWLNXB1qQNwFXB2c4Oo1+6Tv6R1wLMk3+UV4JMRsThP+cuBZRHxk1aIbTBwaETcUuprWeLIXfry1xfmbbK9g8TA3t246ak5rF0fnPru7Xlz6Wr+l9Msa1YqNf9bzL9eXkjd+uDY3ftx4eFDGD3xRUbs0Z+ps9/epPvK2pDSz+0/BxiUWR+YbqvXC9gLmKwkkO2ACZJOKuLYBrX75A+sjIh9AST9Hjgf+F55Q9pgMHAG4OTfApauqaNX13f+ZHt27cTS1e+MaenSUfTt0YXT9t0egB5dOvLhvbbj7v++wdLVddS+vYqVdesBmLVgBdv27Orkby1i4Yo19OvRZcN63x5dWLBizUZlsn+rD86Yz5kHDQRgt216ssd2PTn+Xf3p1rkDnTp0YFXdOv44peC/39aKSvyg3xRgqKQhJIl7FEnuACAi3gb6bYhFmgxcHBE1klYCt0i6imTA31DgyUIXrITkn/UYsA+ApF1IRk/2B1YA50bEC9nCDZUBXgeeAYZExHpJPYAXgJ1JmlzOA7oAM0laGVZIuhFYAgwnuSP7akTcCfwQeJekacDvI+Lq0n31yvfGktX06d6ZLbslzf+7b9OD+55/a8P+NeuC6/792ob10969PZNfXsCby9aweFUdBw3aik4dxLr1waCtujG11s3+1jJemrec7Xt3Y5ueXVi4Yi3v23lrrv7HrI3K9OnemUUrk9r9gTtutWEw4DWTX9lQ5sihfdm1Xw8n/ioTEXWSLgAmAR2BcRHxnKQxQE1ETMhz7HOSbicZHFgHnF9opD9UUPJPH5X4APDbdNP1wOci4iVJ7wGuA47KOWyTMhFxVJqsDwf+AZwITIqItZL+FBE3pNf7LnAO8PP0XNsD7wN2ByYAdwKXktydnViab11dAvj7zPmcsvd2dJB49o2lLFixlvcO7sMbS1fz8oIVjR67um49NbVv84n9k3EwsxauYNbCla0UuVW69QE3/Pt/fPv4YXQQ/P3FBcxevIrT99+BmfOXM+V/b3PCnttw4E5bsW59sGx1HT9/+NVyh21FSgb8lbbuHxETgYk520Y3UvaInPXv0cQW70pI/t3TZD0AeB54QFJP4FDgDr3zC9toZFiBMrcBp5Ek/1EkNw4Ae6VJfyugJ8ldWr17ImI9MF3StoWCTkd6ngew1bY7FP1lq90rC1fyysLajbY9+uqiBsve9vTrG60//9Yynn9rWclis+r2VO3bPHXHxq1Jtz41d8Pnm2rmcFNN/hr9P15awD9eWlCS+Kx5Kmt+v8pI/isjYl9JW5Ak4/OBG4HF9WMBGtEhT5kJwPclbQ0cADyUbr8RODkinpZ0NnBE5pjVmc8F/04i4nqSlgcG7ba3nzozM2vLKiz
7V8xz/hGxAvgi8BWS/vtXJH0MQIl355Rf0liZiFhGMgDjWuAvmf6TXsDrkjoDHy8irKXpMWZmZm1GxSR/gIj4D8lgvdNJkvM5kp4GniOZBSlXvjK3AZ9I/1vvWyTTKj5KMgiwkGeAdZKelvTlJn4dMzNrI9TM/7U17b7ZPyJ65qx/KLM6ooHyl2c+v9JQmXTfneQ09ETEL4FfNlD27IZiioi1bDrI0MzM2pkSj/drde0++ZuZmZVaheX+ymr2NzMzs8Jc8zczMyukwqr+Tv5mZmZ5CNrkoL3mcPI3MzPLp/Qv9ml17vM3MzOrMq75m5mZFVBhFX8nfzMzs4IqLPs7+ZuZmeXVNmfpaw4nfzMzswI84M/MzMzaNdf8zczM8hAV1+Xv5G9mZlZQhWV/J38zM7MCKm3An/v8zczMqoxr/mZmZgVU2mh/J38zM7MCKiz3O/mbmZnlVYHD/d3nb2ZmVmVc8zczMyug0kb7O/mbmZnlITzgz8zMrOpUWO53n7+ZmVm1cc3fzMyskAqr+jv5m5mZFVBpA/7c7G9mZlaA1Lyl8Pk1QtIMSTMlXdrA/s9JelbSNEmPSNoj3T5Y0sp0+zRJvyrm+7jmb2ZmVkAp6/2SOgJjgWOAWmCKpAkRMT1T7JaI+FVa/iTgKmBEuu/liNi3Kdd0zd/MzKy8DgJmRsSsiFgDjAdGZgtExJLMag8gmnNB1/zNzMwKaX7Vv5+kmsz69RFxffp5ADA7s68WeM8mIUjnAxcBXYCjMruGSPoPsAT4ZkT8q1AwTv5mZmZ5JFP7Nzv7z4+I4c05QUSMBcZKOgP4JnAW8DqwY0QskHQAcI+kPXNaCjbhZn8zM7N8mjnYr4gBf3OAQZn1gem2xowHTgaIiNURsSD9PBV4GRhW6IJO/mZmZuU1BRgqaYikLsAoYEK2gKShmdUTgJfS7f3TAYNI2hkYCswqdEE3+5uZmRVQytH+EVEn6QJgEtARGBcRz0kaA9RExATgAklHA2uBRSRN/gCHAWMkrQXWA5+LiIWFrunkb2ZmVkiJ5/iJiInAxJxtozOfL2zkuLuAu5p6PSd/MzOzvOQZ/szMzKx9c83fzMysgGKm6G1PnPzNzMzyEBX3Uj8nfzMzs4IqLPu7z9/MzKzKuOZvZmZWQKWN9nfyNzMzK8AD/szMzKpMheV+J38zM7O8ins5T7viAX9mZmZVxjV/MzOzgiqr6u/kb2ZmloeovGZ/J38zM7MCKiz3u8/fzMys2rjmb2ZmVoCb/c3MzKqMZ/gzMzOrNpWV+93nb2ZmVm1c8zczMyugwir+Tv5mZmb5qAKn93XyNzMzK6DSBvy5z9/MzKzKuOZvZmZWSGVV/J38zczMCqmw3O/kb2ZmVogH/JmZmVUVecCfmZmZtW+u+ZuZmeUhKq/Z3zV/MzOzKuPkb2ZmVkD9LH+buxQ+v0ZImiFppqRLG9j/OUnPSpom6RFJe2T2XZYeN0PSccV8Hyd/MzOzMpLUERgLHA/sAZyeTe6pWyJi74jYF7gSuCo9dg9gFLAnMAK4Lj1fXk7+ZmZmBaiZ/yvgIGBmRMyKiDXAeGBktkBELMms9gAi/TwSGB8RqyPiFWBmer68PODPzMwsn5Z5sU8/STWZ9esj4vr08wBgdmZfLfCeTcKQzgcuAroAR2WOfTzn2AGFgnHyNzMzy0O0yAx/8yNieHNOEBFjgbGSzgC+CZy1uedys7+ZmVl5zQEGZdYHptsaMx44eTOPBZz8zczMClMzl/ymAEMlDZHUhWQA34SNLi8NzayeALyUfp4AjJLUVdIQYCjwZKELutnfzMysgFJO7xsRdZIuACYBHYFxEfGcpDFATURMAC6QdDSwFlhE2uSflrsdmA7UAedHxLpC13TyNzMzK6DUM/xFxERgYs620ZnPF+Y59nvA95pyPTf7m5mZVRnX/M3MzAqosKn9nfzNzMwKqrDs7+RvZmZWQCkH/JWDk7+ZmVkelfhKX0VE4VJWUpLmAa+VO44
K0g+YX+4gzBrhv8+Ws1NE9C/1RSTdT/J7a475ETGiJeJpCU7+VnEk1TR3Gk2zUvHfp7UFftTPzMysyjj5m5mZVRknf6tE1xcuYlY2/vu0snOfv5mZWZVxzd/MzKzKOPmbmZlVGSd/s0ZIybQe9f81M6sUTv5mDZDUg3f+/9G7nLGYmbU0T+9rlkNSF+BU4C1JewGHSDoFWB8eIWutQFKPiFieft4qIhaXOyarLE7+ZjkiYo2kp4B7gbXAhyJiXZnDsiohqTswUtJ8oA8wUNLPI2JNmUOzCuLkb5aSpPqafUQ8LelW4DBgb0lzs7WvbFmzFrYWeBn4I9AZ2Du9Ie0QEevLG5pVCvf5m7FxMpf0fkkDgK8D5wCfBz6W7jtJ0o5O/NbS6geWRkQd8DawnOQm4Jh0uxO/tRhP8mNVLyfxnwt8E/gvMAX4JbADcBXwInAKcEhEvFSmcK0C5fwNbge8GREh6SjgIuDOiLhR0gHAWxExu5zxWvvn5G+WkvQx4CjgYmAP4ENAV+AnJM2v+wLPR8QrZQvSKk5O4r+E5AazE/C9iLhb0odJWp8WANsBp0fEG2UL2CqC+/zN2DDI6tPAruko6ymSOgAnAKOBn0fExHLGaJUpk/jfCxwBjASGA7ek9wV3S5oNnAmMceK3luCav1WlhgbsSeoP3A28FBGfSre9DzgcuD4i5rV+pFYNJB0IXA68ERHnpNs+SDLo74sRcXMZw7MK5ORvVSenmfUzQF9geUT8QtI2wG9I+lU/k5bpFhGryhexVZrcm8+0lekrwKHAtcDjEbFK0kjg58BewDIP+rOW4uRvVUvS/wGnAV8GHgV+HBHfSFsA7gT+GxHn+7E+a0k5N5+nAOuAV9LHSy8FhgDjSW4AVmYn/DFrKU7+VpUyNfxPA6eT9LP2Ax6LiM+nNwDdPKraSkXSRcDJwIPAwcAtEXGTpIuB/YFfRcQ/ffNppeDn/K3qpAOrugJnAPsAp0bE0SQDqj4r6SsRMc+J31pS2rRf//k9wGERcRjJwOvewAhJZ0bET4AnSB4txYnfSsHJ36qKpL7Ap4ChEbEs3TxXUidgZ2AcyaA/sxZV318vaeeIeAL4kqRRJLNIngDUAhdLOisirvWofislJ3+rKhGxAHgW+KGkPsCrwCLgT8CVwI8iYlb5IrRKpcQOwBOSDo6IV0me278xIt4G3gD+AjxQxjCtSrjP36pC2sy6fUTck67/GHg67WMdCvQneczKid9aTH1/fc4gvy8BayLiOkmfJBl7ci3JmySPjoiZZQzZqoRr/laR6udJTz93AkYAH5N0j6RdgIUks/gRES9FxL+d+K2lZfrrD89sfgY4JR3F/0eSGf1mAsc58Vtrcc3fKk5OLesIYAnJ1KizSabqXQ3sTjLC/4yIGF+mUK1CZWv8QBfgXyQD+O4HbiOZNbI/cL5fF23l4Ol9rWLU/4ObM0/6CSRvRhsAXBQRF0kaTDJ96gCSl/eYtZicR/OGAa8D7wFOJBncdzHJzH3DSSaYeqsccVp1c/K3StIVWCWpI0mT/qERcYSkbwPbAi9J6pgOtHpV0n0RsbKM8VoFytx8/h/wUeAFoH9EfAS4Nx3hfyxwAMkLo8xanfv8rd1LR1HvAryWPka1DlgBzJY0lqTW9ZGIWAscL6lXeqin7LUWk74cqv7zB0km8DmRpMupV3pTStrN9CWSl0jNKUesZk7+1u6lLf0vA78FJksakq73IJnE57yIWCPpHGAM0K3+uLIFbRVF0m7AlyXtmm5aDPwCOIekef+EiFgn6RiAiFgSEUvKE62ZB/xZO5cOqFJmApXvAOfyTp/+Z0lmT/sfcDzJbH7PlSlcq1CSjiJp4n+VZEDf1sBDwIyIODgtczZwHPBZJ34rNyd/a7dyRvUPqp+OV9LXgS+QzI++FjgG6A48krYImLWInL/BI4GTgHnAz0geL72G5MVRu5I8x//JiPhvmcI128DJ39q9dNKU95M0tT4UETenLQBnAh/
w8/tWCg29cEfSu0lanuYC1wHvAz6Q7v51RLzQulGaNczJ39o1SSeTDJ76APAIMC0iPp/u+yHwYZJ3oa/zu9CtFCRdQPJeiN5A/ZMlnySZrvd3EeFH+azNcfK3dkXSlkDHiFiYrp8DrCQZ3PdR4EPp4L5BETFbUr+ImF/GkK2C5Nb2JX2eZIa+84C7gH9GxIWSDiNpeXqOZOBfnQeYWlvi5/yt3ZB0IskEKX0l/SoixpIMsLoSWBIRR6blLgJ2kXQhyWNWZi2lC8kMkfW2JenL/xRJU/8lkjqTzOi3HKhNHzE1a1Oc/K1dkDQC+B5JDas3ME7SLJIZ+p4GXpF0Asko60+QDKyqK1e8VnkkHQt8XtI04L8RcRewA8mUvTOBkRFRl3YDrImI68sYrllefs7f2jxJ25CMmJ4cEU9ExAO8M4J6Ick86QuAUSSPUp3px/msJaU3n1cAD5L8u3m8pK2BnwLbA/9JE//ZJE+aTC5TqGZFcZ+/tQvpP6p7kDw3/VtJN5M8ytcT+DlJTesaSZ3dzGotKU3y80lq9vdKGkjSCnVDRDwiaU/gRpL+/aHAuRExvWwBmxXByd/atJznqM8E9gP2BNYDZwD7AgeRvDDlXGCuB1ZZS0u7lK4EDomIJZLuA7YEngKeBB4jHV8SEYvLFqhZkZz8rc3LuQE4jaRP/y8R8euGypiVgqTjSSbvuZ+ky+l6YBuSm85pwJcjYmn5IjQrnpO/tTnpC1LWpn2o3SJiVc4NwCeBdwNzSJpel5UzXqseko4G/gZsHxFvpts6AFv7kVJrTzza39oUST1JJuypTf+h7SjphxGxvv4GICL+KKkrSe3Lf8PWaiLiwbQL4B+SjoiIt9LJo5z4rV3xP5zWpkTEMkm9gd+R/H1+tH5mvoiIzA3AbyT19gtSrLVFxF8ldQHulzTcM0dae+Rmf2sTcpr1+5Ik/7XA1STPVC9uqKxZuUjq6S4na6/8nL+VXU7iHwZ0Jpk17VaSV/K+L923X1rbd+K3snPit/bMNX9rMyR9ATgHmAH0AU4mmTf9GKAu/e8hEfF62YI0M6sA7vO3spHUq/7RKEnvJ5m692SSOdKvJpkf/ZB0fXfgKid+M7Pmc7O/lYWkXYBvSTow3bQYeCwiXiV5zO98YBbw4YiYHBG/iojnyxSumVlFcfK3ctmSZJa+D0val2R2tGMlnZjp059L8qpeMzNrQe7zt1Ylaav6kfvpnOijgO7AT0ie27+b5GUpHUn6+0dFxItlCtfMrCK55m+tJp2050lJ16bN/QuBscAy4EKS16IeQ9Ii0Av4uBO/mVnLc83fWk3avP84sAb4OknC/xHJYL55JPOkXxMRs8sWpJlZFfBof2s1ETFN0v7Aw8AS4FjgSOAAkjEA+wIdJH2NZNCf70zNzErANX9rdWmT/4PAhRFxo6SOJC/qORb4s0f1m5mVlpO/lUV6A/A34BsRcV254zEzqyZu9reyiIgp6QDAKZJWRcS4csdkZlYtXPO3spK0H7AiImaUOxYzs2rh5G9mZlZl/Jy/mZlZlXHyNzMzqzJO/mZmZlXGyd/MzKzKOPmbmZlVGSd/sxYg6XJJkVnmSrpL0i4lvOaJ6bUGp+uD0/UTm3COUyWd3YIx9UxjaPScmxNnetyNkmqaHWRyrsmS7myJc5m1R57kx6zlvA2MSD/vDFwB/F3SnhGxvBWu/zpwCPBCE445FegH3FiKgMysbXLyN2s5dRHxePr5cUn/A/4FfBC4I7ewpO4RsbKlLh4Rq0nemmhmlpeb/c1KZ2r638EAkl6V9FNJ35JUS/JmQyR1kHSppJmSVkt6UdJZ2RMpcbmktyQtlfQHoHdOmQab0yWdK+lZSaskvSnpTklbSroROAU4PNNdcXnmuJGSatLj3pB0paTOOec+JY13paR/kryeuckknSnpEUkLJS2S9A9Jwxspe7KkF9K4HpG0R87+gj9Ps2rnmr9Z6QxO//tGZtsZwHPAF3jn/38/B84CxgB
PAccA4yQtiIi/pGW+CIwGvk/SmvAR4MpCAUj6Znre64BLgC2AE4CeJN0SOwJbpfEA1KbHnQrcCvwa+DqwC/ADkgrDxWmZ/YHbgLuBC4G9gNsLxdSIwcAfgJeBLsDpwL/SLpNZmXI7AVcB3wJWAt8BJkkaGhGr0jLF/DzNqltEePHipZkLcDkwnyShdwKGAf8gqd1vn5Z5laRfvlvmuF2B9cBZOef7AzAl/dwRmAv8MqfMA0AAg9P1wen6ien6VsD/t3cHoXVUURjH/6cWaxc2QkpooFVwkRLcpF0UEpdWujNFK3GlohIibqRCoRRKi4hUpLgSodUEdJGGtGlcqCgtwUWttOpODBRK0CTNo5aGSqi24XRx7jOvk0nmJQQeZL4fDJOZuW/enUvCmXvn3MkccHKZeg8DY5l9BkwA/Zn9bxABtzltDwG/k14TnvYdSXV4fZnvfKieOcc3pDb8Azhas38gfa6rZt9TwH2gr972TNtjwHCjf2+0aGnUomF/kbXTDNxLyziR9Nfj7tM1ZS74Qg8V4DkiWI2Y2cbqAlwAOszsEWAH0AqMZr7vXEF9OoHNQP8Kr6ONGBEYytTpIvAY0cMH2AN87e61/yCkqE65zKzdzEbMbAaYJ9pwZ6pLrYq7X6puuPsE8XhlT9pVT3uKlJ6G/UXWziywl+id3gCmMoERYCazvZXo2c8ucc5WYFv6uZI5lt3Oak7r6WVLLbY1rb9Z4viOtN62ijotYmaPA98TbXOQGHW4C5wmbjaKzl8h2gnqa8+/VlpHkfVGwV9k7dx396J56NmbgVvEsPWzRI81q8LC32lL5lh2O+vvtG4lHknU61Za9wK/5Ry/ntY3VlGnPJ3AduB5d/9/mqKZNeWUzTt/C5FHAfW1p0jpKfiLNNZFoqfa5O4/5BUwsz+JQNsNfFdz6MWCc/9EPKN/jZSkl+M/Fveux4FJIpfg1DLnvwK8YGaHa0Y4iuqUZ3Na/1vdYWZdRG7AL5myLWbWVR36N7Mngd0sPNoobE8RUfAXaSh3Hzezz4BBM/sIuEoE42eANnd/y93n07GPzewmke3/EtBecO7bZvY+8IGZPUoM428isv2Pu/skkVTXbWb7ieHwKXefMrP3gC/NbAvwLXGT8DSwHzjg7nPACeBnIjfgcyIX4M1VNMNl4B/gVLrO7UQC5WRO2ZvAV2kWQzXbv0J6SVE97bmK+omsO0r4E2m8d4hpd68SAXqACNA/1pT5hJjm1wecJabqHSo6sbt/CLxN5CKMElP3ngDupCKfEs/bvyB68r3pc2eIkYYO4gVF54jpgL8SNwKkRxyvALuA88SNQc9KL97dZ4CXiRyCUeDddJ3XcopPEKMYx4DBdB37MkmU9bSnSKnZ4nwkERERWc/U8xcRESkZBX8REZGSUfAXEREpGQV/ERGRklHwFxERKRkFfxERkZJR8BcRESkZBX8REZGSeQA0Kh0wybQVyAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": { + "id": "Fd_-M70F5xbl" + }, + "source": [ + "So, how do we choose which one is the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how well it is doing with the \"relevant\" category, we should choose the second one we built. If we choose purely based on how well it is doing with the \"irrelevant\" category, surely, nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our use case!" ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "iMJlTrJvLrS2" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "from sklearn.svm import LinearSVC\n", - "\n", - "vect = CountVectorizer(preprocessor=clean, max_features=1000) # Step-1\n", - "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", - "X_test_dtm = vect.transform(X_test)\n", - "\n", - "classifier = LinearSVC(class_weight='balanced') # instantiate a logistic regression model\n", - "classifier.fit(X_train_dtm, y_train) # fit the model with training data\n", - "\n", - "# Make predictions on test data\n", - "y_pred_class = classifier.predict(X_test_dtm)\n", - "\n", - "# calculate evaluation measures:\n", - "print(\"Accuracy: \", accuracy_score(y_test, 
y_pred_class))\n", - "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with normalization')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fd_-M70F5xbl" - }, - "source": [ - "So, how do we choose whats the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how good it is doing with \"relevant\" category, we should choose the second one we built. If we choose purely based on how good it is doing with \"irrelevant\" category, surely, nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our usecase! 
" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "01_OnePipeline_ManyClassifiers.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Ch4/02_Doc2Vec_Example.ipynb b/Ch4/02_Doc2Vec_Example.ipynb index c1a371e..c6c63c7 100644 --- a/Ch4/02_Doc2Vec_Example.ipynb +++ b/Ch4/02_Doc2Vec_Example.ipynb @@ -1,531 +1,653 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "LCgVnQopb6TI" - }, - "source": [ - "# Doc2Vec demonstration \n", - "\n", - "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", - "\n", - "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "LCgVnQopb6TI" + }, + "source": [ + "# Doc2Vec demonstration\n", + "\n", + "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. 
We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", + "\n", + "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" + ] }, - "id": "KX5dKXdcaENd", - "outputId": "956f503d-1a2c-4af1-aad5-a5da021ae29b" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting nltk==3.5\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)\n", - "\u001b[K |████████████████████████████████| 1.4MB 5.1MB/s \n", - "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (1.0.1)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (2019.12.20)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (4.41.1)\n", - "Building wheels for collected packages: nltk\n", - " Building wheel for nltk (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434691 sha256=a68222bfb8c06405a2c5f264264ffa3daf49f5d73637541f961a024360751028\n", - " Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306\n", - "Successfully built nltk\n", - "Installing collected packages: nltk\n", - " Found existing installation: nltk 3.2.5\n", - " Uninstalling nltk-3.2.5:\n", - " Successfully uninstalled nltk-3.2.5\n", - "Successfully installed nltk-3.5\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting gensim==3.8.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)\n", - "\u001b[K |████████████████████████████████| 24.2MB 1.3MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.4.1)\n", - "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (5.1.0)\n", - "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.15.0)\n", - "Installing collected packages: 
gensim\n", - " Found existing installation: gensim 3.6.0\n", - " Uninstalling gensim-3.6.0:\n", - " Successfully uninstalled gensim-3.6.0\n", - "Successfully installed gensim-3.8.3\n", - "Collecting scikit-learn==0.21.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9f/c5/e5267eb84994e9a92a2c6a6ee768514f255d036f3c8378acfa694e9f2c99/scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7MB)\n", - "\u001b[K |████████████████████████████████| 6.7MB 5.1MB/s \n", - "\u001b[?25hRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Installing collected packages: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n" - ] - } - ], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install nltk==3.5\n", - "!pip install pandas==1.1.5\n", - "!pip install gensim==3.8.3\n", - "!pip install scikit-learn==0.21.3\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "CIlwQe1S4EpL" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", 
- "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KX5dKXdcaENd", + "outputId": "c18e98fa-df2d-49a9-baf5-4ac23d66297c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: nltk==3.8.1 in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (2023.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (4.66.1)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gensim==4.3.1 in /usr/local/lib/python3.10/dist-packages (4.3.1)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.23.5)\n", + "Requirement already 
satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.10.1)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (6.3.0)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install nltk==3.8.1\n", + "!pip install pandas==1.5.3\n", + "!pip install gensim==4.3.1\n", + "!pip install scikit-learn==1.2.2\n", + "\n", + "# ===========================" + ] }, - "id": "hSB6W1seb6TJ", - "outputId": "e93459c9-fd82-4d22-852b-819faeb430a6" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n" - ] - } - ], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import pandas as pd\n", - "import nltk\n", - "nltk.download('stopwords')\n", - "from nltk.tokenize import TweetTokenizer\n", - "from nltk.corpus import stopwords\n", - "from sklearn.model_selection import train_test_split\n", - "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "CIlwQe1S4EpL" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] }, - "id": "NGAFbmrA4EpM", - "outputId": "f78def1c-c291-4fba-dd41-f24f1456757c" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-07-16 08:27:55-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 2479133 (2.4M) [text/plain]\n", - "Saving to: ‘DATAPATH/train_data.csv’\n", - "\n", - "train_data.csv 100%[===================>] 2.36M --.-KB/s in 0.1s \n", - "\n", - "2021-07-16 08:27:55 (22.4 MB/s) - ‘DATAPATH/train_data.csv’ saved [2479133/2479133]\n", - "\n", - "--2021-07-16 08:27:55-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 783640 (765K) [text/plain]\n", - "Saving to: ‘DATAPATH/test_data.csv’\n", - "\n", - "test_data.csv 100%[===================>] 765.27K --.-KB/s in 0.05s \n", - "\n", - "2021-07-16 08:27:55 (15.4 MB/s) - ‘DATAPATH/test_data.csv’ saved [783640/783640]\n", - "\n", - "total 3.2M\n", - "drwxr-xr-x 2 root root 4.0K Jul 16 08:27 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:27 ..\n", - "-rw-r--r-- 1 root root 766K Jul 16 08:27 test_data.csv\n", - "-rw-r--r-- 1 root root 2.4M Jul 16 08:27 train_data.csv\n" - ] - } - ], - "source": [ - "#Load the dataset and explore.\n", - "try:\n", - " from google.colab import files\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", - " !ls -lah DATAPATH\n", - " filepath = \"DATAPATH/train_data.csv\"\n", - "except ModuleNotFoundError:\n", - " filepath = \"Data/Sentiment and Emotion in Text/train_data.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hSB6W1seb6TJ", + "outputId": "9e34f468-dc75-4555-9522-4fea208d6a00" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import pandas as pd\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.corpus import stopwords\n", + "from sklearn.model_selection import 
train_test_split\n", + "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" + ] }, - "id": "lSvnHBYPb6TQ", - "outputId": "b992755a-470e-470b-eb59-e4225711f252" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(30000, 2)\n" - ] + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGAFbmrA4EpM", + "outputId": "947b9250-7fd2-4cc7-c74c-4c88ecfdd4fa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-08-22 16:12:41-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2479133 (2.4M) [text/plain]\n", + "Saving to: ‘DATAPATH/train_data.csv.1’\n", + "\n", + "\rtrain_data.csv.1 0%[ ] 0 --.-KB/s \rtrain_data.csv.1 100%[===================>] 2.36M --.-KB/s in 0.02s \n", + "\n", + "2023-08-22 16:12:42 (131 MB/s) - ‘DATAPATH/train_data.csv.1’ saved [2479133/2479133]\n", + "\n", + "--2023-08-22 16:12:42-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 783640 (765K) [text/plain]\n", + "Saving to: ‘DATAPATH/test_data.csv.1’\n", + "\n", + "test_data.csv.1 100%[===================>] 765.27K --.-KB/s in 0.009s \n", + "\n", + "2023-08-22 16:12:42 (78.7 MB/s) - ‘DATAPATH/test_data.csv.1’ saved [783640/783640]\n", + "\n", + "total 6.3M\n", + "drwxr-xr-x 2 root root 4.0K Aug 22 16:12 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 16:10 ..\n", + "-rw-r--r-- 1 root root 766K Aug 22 16:08 test_data.csv\n", + "-rw-r--r-- 1 root root 766K Aug 22 16:12 test_data.csv.1\n", + "-rw-r--r-- 1 root root 2.4M Aug 22 16:08 train_data.csv\n", + "-rw-r--r-- 1 root root 2.4M Aug 22 16:12 train_data.csv.1\n" + ] + } + ], + "source": [ + "#Load the dataset and explore.\n", + "try:\n", + " from google.colab import files\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + " !ls -lah DATAPATH\n", + " filepath = \"DATAPATH/train_data.csv\"\n", + "except ModuleNotFoundError:\n", + " filepath = \"Data/Sentiment and Emotion in Text/train_data.csv\"" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentimentcontent
0empty@tiffanylue i know i was listenin to bad habi...
1sadnessLayin n bed with a headache ughhhh...waitin o...
2sadnessFuneral ceremony...gloomy friday...
3enthusiasmwants to hang out with friends SOON!
4neutral@dannycastillo We want to trade with someone w...
\n", - "
" + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "id": "lSvnHBYPb6TQ", + "outputId": "aaff8fc3-c5fd-457e-f0dc-19d4b412a8ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(30000, 2)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sentiment content\n", + "0 empty @tiffanylue i know i was listenin to bad habi...\n", + "1 sadness Layin n bed with a headache ughhhh...waitin o...\n", + "2 sadness Funeral ceremony...gloomy friday...\n", + "3 enthusiasm wants to hang out with friends SOON!\n", + "4 neutral @dannycastillo We want to trade with someone w..." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentimentcontent
0empty@tiffanylue i know i was listenin to bad habi...
1sadnessLayin n bed with a headache ughhhh...waitin o...
2sadnessFuneral ceremony...gloomy friday...
3enthusiasmwants to hang out with friends SOON!
4neutral@dannycastillo We want to trade with someone w...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 15 + } ], - "text/plain": [ - " sentiment content\n", - "0 empty @tiffanylue i know i was listenin to bad habi...\n", - "1 sadness Layin n bed with a headache ughhhh...waitin o...\n", - "2 sadness Funeral ceremony...gloomy friday...\n", - "3 enthusiasm wants to hang out with friends SOON!\n", - "4 neutral @dannycastillo We want to trade with someone w..." + "source": [ + "df = pd.read_csv(filepath)\n", + "print(df.shape)\n", + "df.head()" ] - }, - "execution_count": 5, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(filepath)\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "5JEI6SH7b6TU", - "outputId": "7c4bccf9-3c39-4e43-cde8-3989a7a002d0" - }, - "outputs": [ { - "data": { - "text/plain": [ - "worry 7433\n", - "neutral 6340\n", - "sadness 4828\n", - "happiness 2986\n", - "love 2068\n", - "surprise 1613\n", - "hate 1187\n", - "fun 1088\n", - "relief 1021\n", - "empty 659\n", - "enthusiasm 522\n", - "boredom 157\n", - "anger 98\n", - "Name: sentiment, dtype: int64" + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5JEI6SH7b6TU", + "outputId": "c3c1034c-6a1c-4e87-9d21-3fdc9f58c9dd" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "worry 7433\n", + "neutral 6340\n", + "sadness 4828\n", + "happiness 2986\n", + "love 2068\n", + "surprise 1613\n", + "hate 1187\n", + "fun 1088\n", + "relief 1021\n", + "empty 659\n", + "enthusiasm 522\n", + "boredom 157\n", + "anger 98\n", + "Name: sentiment, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "df['sentiment'].value_counts()" ] - }, - "execution_count": 6, - "metadata": { - "tags": [] - }, - "output_type": 
"execute_result" - } - ], - "source": [ - "df['sentiment'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "CHajyKpmb6TY", - "outputId": "bbb05164-f107-4b7c-fedb-145a3b2d1ca3" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(16759, 2)" + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CHajyKpmb6TY", + "outputId": "9c28cf4f-87f4-4261-bd32-4e2abfd9435f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(16759, 2)" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "#Let us take the top 3 categories and leave out the rest.\n", + "shortlist = ['neutral', \"happiness\", \"worry\"]\n", + "df_subset = df[df['sentiment'].isin(shortlist)]\n", + "df_subset.shape" ] - }, - "execution_count": 7, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "#Let us take the top 3 categories and leave out the rest.\n", - "shortlist = ['neutral', \"happiness\", \"worry\"]\n", - "df_subset = df[df['sentiment'].isin(shortlist)]\n", - "df_subset.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m2oiZzU5b6Tf" - }, - "source": [ - "# Text pre-processing:\n", - "Tweets are different. Somethings to consider:\n", - "- Removing @mentions, and urls perhaps?\n", - "- using NLTK Tweet tokenizer instead of a regular one\n", - "- stopwords, numbers as usual." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "Rl-FfMdLb6Th", - "outputId": "818e0510-afdb-4732-fe69-c6119ca695c1" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "16759 16759\n" - ] - } - ], - "source": [ - "#strip_handles removes personal information such as twitter handles, which don't\n", - "#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.\n", - "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", - "mystopwords = set(stopwords.words(\"english\"))\n", - "\n", - "#Function to tokenize tweets, remove stopwords and numbers. \n", - "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", - "def preprocess_corpus(texts):\n", - " def remove_stops_digits(tokens):\n", - " #Nested function that removes stopwords and digits from a list of tokens\n", - " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. \n", - " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", - "\n", - "#df_subset contains only the three categories we chose. \n", - "mydata = preprocess_corpus(df_subset['content'])\n", - "mycats = df_subset['sentiment']\n", - "print(len(mydata), len(mycats))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "m2oiZzU5b6Tf" + }, + "source": [ + "# Text pre-processing:\n", + "Tweets are different. Somethings to consider:\n", + "- Removing @mentions, and urls perhaps?\n", + "- using NLTK Tweet tokenizer instead of a regular one\n", + "- stopwords, numbers as usual." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Rl-FfMdLb6Th", + "outputId": "df2382f7-5823-4831-f356-85cf93d23ab6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "16759 16759\n" + ] + } + ], + "source": [ + "#strip_handles removes personal information such as twitter handles, which don't\n", + "#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.\n", + "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", + "mystopwords = set(stopwords.words(\"english\"))\n", + "\n", + "#Function to tokenize tweets, remove stopwords and numbers.\n", + "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", + "def preprocess_corpus(texts):\n", + " def remove_stops_digits(tokens):\n", + " #Nested function that removes stopwords and digits from a list of tokens\n", + " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", + " #This return statement below uses the above function to process twitter tokenizer output further.\n", + " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", + "\n", + "#df_subset contains only the three categories we chose.\n", + "mydata = preprocess_corpus(df_subset['content'])\n", + "mycats = df_subset['sentiment']\n", + "print(len(mydata), len(mycats))" + ] }, - "id": "rsGwfVebb6Tl", - "outputId": "c19bc96f-513c-45b6-d476-b95899ab7eca" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Saved\n" - ] + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rsGwfVebb6Tl", + "outputId": "0329a297-b6d9-4be1-d4c7-afa61ef786d3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model Saved\n" + ] + } + ], + "source": [ + "#Split data into train 
and test, following the usual process\n", + "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", + "\n", + "#prepare training data in doc2vec format:\n", + "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", + "\n", + "#Train a doc2vec model to learn tweet representations. Use only training data!!\n", + "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", + "model.build_vocab(train_doc2vec)\n", + "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", + "model.save(\"d2v.model\")\n", + "print(\"Model Saved\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hTqo26Vsb6Ts", + "outputId": "5308af1c-a3a4-45d4-cf3d-b442fef129b7" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " happiness 0.32 0.51 0.39 713\n", + " neutral 0.45 0.53 0.49 1595\n", + " worry 0.60 0.38 0.47 1882\n", + "\n", + " accuracy 0.46 4190\n", + " macro avg 0.46 0.47 0.45 4190\n", + "weighted avg 0.50 0.46 0.46 4190\n", + "\n" + ] + } + ], + "source": [ + "#Infer the feature representation for training and test data using the trained model\n", + "model= Doc2Vec.load(\"d2v.model\")\n", + "\n", + "#infer in multiple steps to get a stable representation.\n", + "train_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]\n", + "test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]\n", + "\n", + "#Use any regular classifier like logistic regression\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced.\n", + "myclass.fit(train_vectors, train_cats)\n", + "\n", + "preds = 
myclass.predict(test_vectors)\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "print(classification_report(test_cats, preds))\n", + "\n", + "#print(confusion_matrix(test_cats,preds))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "qcRNGUJFAA1w" + }, + "outputs": [], + "source": [] } - ], - "source": [ - "#Split data into train and test, following the usual process\n", - "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", - "\n", - "#prepare training data in doc2vec format:\n", - "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", - "#Train a doc2vec model to learn tweet representations. Use only training data!!\n", - "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", - "model.build_vocab(train_doc2vec)\n", - "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", - "model.save(\"d2v.model\")\n", - "print(\"Model Saved\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { + ], + "metadata": { + "accelerator": "GPU", "colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "hTqo26Vsb6Ts", - "outputId": "cd16346c-ca81-4dc7-c269-d9ccf83a774d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " happiness 0.37 0.45 0.41 713\n", - " neutral 0.46 0.53 0.49 1595\n", - " worry 0.58 0.46 0.51 1882\n", - "\n", - " accuracy 0.48 4190\n", - " macro avg 0.47 0.48 0.47 4190\n", - "weighted avg 0.50 0.48 0.49 4190\n", - "\n" - ] + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "#Infer the feature representation for training and test data using the trained model\n", - "model= Doc2Vec.load(\"d2v.model\")\n", - "#infer in multiple steps to get a stable representation. \n", - "train_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in train_data]\n", - "test_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in test_data]\n", - "\n", - "#Use any regular classifier like logistic regression\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced. \n", - "myclass.fit(train_vectors, train_cats)\n", - "\n", - "preds = myclass.predict(test_vectors)\n", - "from sklearn.metrics import classification_report, confusion_matrix\n", - "print(classification_report(test_cats, preds))\n", - "\n", - "#print(confusion_matrix(test_cats,preds))" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "02_Doc2Vec_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index 20ec9b5..7ac82f1 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -1,31 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "03_Word2Vec_Example.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - 
"display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, "cells": [ { "cell_type": "markdown", @@ -52,100 +25,126 @@ }, { "cell_type": "code", + "source": [ + "import pkg_resources\n", + "\n", + "def get_library_versions(library_list):\n", + " frozen_list = []\n", + "\n", + " for library in library_list:\n", + " try:\n", + " version = pkg_resources.get_distribution(library).version\n", + " frozen_list.append(f\"{library}=={version}\")\n", + " except pkg_resources.DistributionNotFound:\n", + " print(f\"Error: {library} not found or could not retrieve version.\")\n", + "\n", + " return frozen_list\n", + "\n", + "# List of library names\n", + "libraries = [\"numpy\", \"pandas\", \"gensim\", \"nltk\", \"scikit-learn\", \"gdown\"]\n", + "\n", + "# Get frozen list of library versions\n", + "frozen_versions = get_library_versions(libraries)\n", + "\n", + "# Print the frozen list\n", + "for item in frozen_versions:\n", + " print(item)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KN4IKdaFCH7c", + "outputId": "ad4e3a11-17ce-4049-8ce9-53972fe41bfb" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "numpy==1.23.5\n", + "pandas==1.5.3\n", + "gensim==4.3.1\n", + "nltk==3.8.1\n", + "scikit-learn==1.2.2\n", + "gdown==4.6.6\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "77UP8YyEdS2W", - "outputId": "1bb0a097-0232-42fd-ec29-b2e96ce857f5" + "outputId": "c7203e6a-e19d-4e9a-f577-ae936a2e1a4d" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: 
numpy==1.23.5 in /usr/local/lib/python3.10/dist-packages (1.23.5)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gensim==4.3.1 in /usr/local/lib/python3.10/dist-packages (4.3.1)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.10.1)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (6.3.0)\n", + "Requirement already satisfied: nltk==3.8.1 in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (2023.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (4.66.1)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + 
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n", + "Requirement already satisfied: gdown==4.6.6 in /usr/local/lib/python3.10/dist-packages (4.6.6)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (3.12.2)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (2.31.0)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (1.16.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.66.1)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.11.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown==4.6.6) (2.4.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2023.7.22)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (1.7.1)\n" + ] + } + ], "source": [ "# To 
install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install gensim==3.8.3\n", - "!pip install wget==3.2\n", - "!pip install nltk==3.5\n", - "!pip install scikit-learn==0.21.3\n", + "!pip install numpy==1.23.5\n", + "!pip install pandas==1.5.3\n", + "!pip install gensim==4.3.1\n", + "!pip install nltk==3.8.1\n", + "!pip install scikit-learn==1.2.2\n", + "!pip install gdown==4.6.6\n", "\n", "# ===========================" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting gensim==3.8.3\n", - " Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)\n", - "\u001b[K |████████████████████████████████| 24.2 MB 84.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.19.5)\n", - "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.15.0)\n", - "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.4.1)\n", - "Requirement already satisfied: 
smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (5.1.0)\n", - "Installing collected packages: gensim\n", - " Attempting uninstall: gensim\n", - " Found existing installation: gensim 3.6.0\n", - " Uninstalling gensim-3.6.0:\n", - " Successfully uninstalled gensim-3.6.0\n", - "Successfully installed gensim-3.8.3\n", - "Collecting wget==3.2\n", - " Downloading wget-3.2.zip (10 kB)\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9673 sha256=4877de9e41ccfba395a6bc044ccad7ba2ea4f6324ca63bbf9da41b644eb8efea\n", - " Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting nltk==3.5\n", - " Downloading nltk-3.5.zip (1.4 MB)\n", - "\u001b[K |████████████████████████████████| 1.4 MB 14.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (1.0.1)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (2019.12.20)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (4.41.1)\n", - "Building wheels for collected packages: nltk\n", - " Building wheel for nltk (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434691 sha256=efc90917aca010ac50551beb55d51252d0b46e103b87d83a9b66c70d6b6fd4ba\n", - " Stored in directory: /root/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266\n", - "Successfully built nltk\n", - "Installing collected packages: nltk\n", - " Attempting uninstall: nltk\n", - " Found existing installation: nltk 3.2.5\n", - " Uninstalling nltk-3.2.5:\n", - " Successfully uninstalled nltk-3.2.5\n", - "Successfully installed nltk-3.5\n", - "Collecting scikit-learn==0.21.3\n", - " Downloading scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)\n", - "\u001b[K |████████████████████████████████| 6.7 MB 14.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Installing collected packages: scikit-learn\n", - " Attempting uninstall: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 4, "metadata": { "id": "URLGvBLv9T0M" }, + "outputs": [], "source": [ "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", "\n", @@ -158,25 +157,35 @@ "# !pip install -r \"ch4-requirements.txt\"\n", "\n", "# ===========================" - ], - "execution_count": 2, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" 
}, "id": "JQX8DAmBb_Hr", - "outputId": "a89dcee7-f76f-4bd9-ba60-8642b88ab50c" + "outputId": "65dc3618-5f7b-41a9-88bf-a5090b32f270" }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ], "source": [ "#basic imports\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "import os\n", - "import wget\n", "import gzip\n", "import shutil\n", "from time import time\n", @@ -194,57 +203,30 @@ "from gensim.models import Word2Vec, KeyedVectors\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import classification_report" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Unzipping tokenizers/punkt.zip.\n" - ], - "name": "stderr" - } + "from sklearn.metrics import classification_report\n", + "\n", + "#google-drive download imports\n", + "import gdown" ] }, { "cell_type": "code", + "execution_count": 9, "metadata": { - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": 
"Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24
gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmV
zb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwo
gICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "" - } - }, - "base_uri": "https://localhost:8080/", - "height": 140 - }, - "id": "S8RM8c6AS8AX", - "outputId": "0b366a76-49b0-4170-dce6-33572a37a929" + "id": "S8RM8c6AS8AX" }, + "outputs": [], "source": [ "try:\n", " from google.colab import files\n", - " \n", + "\n", " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in \"sentiment labelled sentences\" folder\n", " uploaded = files.upload()\n", - " \n", + "\n", " !mkdir DATAPATH\n", " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", - " \n", + "\n", "except ModuleNotFoundError:\n", "\n", " fil = 'sentiment_sentences.txt'\n", @@ -252,7 +234,7 @@ " if not 
os.path.exists(\"Data/sentiment_sentences.txt\"):\n", " file = open(os.path.join(path, fil), 'w')\n", " file.close()\n", - " \n", + "\n", " # combined the three files to make sentiment_sentences.txt\n", " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", "\n", @@ -263,88 +245,95 @@ " print(\"File created\")\n", " else:\n", " print(\"File already exists\")" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt\n", - "Saving imdb_labelled.txt to imdb_labelled.txt\n", - "Saving yelp_labelled.txt to yelp_labelled.txt\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "COUGXAxcb_H5", - "scrolled": true, - "outputId": "b88ee64f-6c36-412e-ce57-f9387eec3051" + "outputId": "640ac771-0389-4640-ab79-a11f26bd2c29", + "scrolled": true }, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", + "To: /content/DATAPATH/GoogleNews-vectors-negative300.bin.gz\n", + "100% 1.65G/1.65G [00:15<00:00, 103MB/s]\n", + "CPU times: user 28.1 s, sys: 3.92 s, total: 32.1 s\n", + "Wall time: 34.4 s\n", + "done loading Word2Vec\n" + ] + } + ], "source": [ "#Load the pre-trained word2vec model and the dataset\n", + "\n", + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, 
filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_data(location: str) -> None:\n", + " with gzip.open(os.path.join(location, 'GoogleNews-vectors-negative300.bin.gz'), 'rb') as f_in:\n", + " with open(os.path.join('./Data', './GoogleNews-vectors-negative300.bin'), 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + "\n", "try:\n", - " \n", " from google.colab import files\n", " data_path= \"DATAPATH\"\n", - " !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", - " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n", + " !gdown -O DATAPATH/ https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\n", + " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz\n", " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", - " \n", + "\n", "except ModuleNotFoundError:\n", - " \n", - " data_path= \"Data\"\n", - " \n", - " if not os.path.exists('GoogleNews-vectors-negative300.bin'):\n", - " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", - " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", - " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", - "\n", - " with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:\n", - " with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:\n", - " shutil.copyfileobj(f_in, f_out)\n", - "\n", - " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", - " else:\n", - " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", "\n", - " else:\n", - " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", + " data_path = './Data/'\n", + " compressed_file_name = 'GoogleNews-vectors-negative300.bin.gz'\n", + " extracted_file_name = 'GoogleNews-vectors-negative300.bin'\n", + "\n", + " # Check if Extracted File 
exists\n", + " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + "\n", + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + "\n", " else:\n", - " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", - " \n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + "\n", + " if location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_data(os.path.join(location_of_compressed_file))\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " else:\n", + " # Download File\n", + " output_path = './Data/'\n", + " gdown.download(\"https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_data(output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")\n", " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", - " \n", - " \n", - "#Load W2V model. This will take some time. \n", + "\n", + "\n", + "#Load W2V model. This will take some time.\n", "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", "print('done loading Word2Vec')\n", "\n", @@ -357,95 +346,85 @@ " text, sentiment = line.split(\"\\t\")\n", " texts.append(text)\n", " cats.append(sentiment)" - ], - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "text": [ - "--2021-07-20 08:36:30-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.130.248\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.130.248|:443... 
connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1647046227 (1.5G) [application/x-gzip]\n", - "Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’\n", - "\n", - "GoogleNews-vectors- 100%[===================>] 1.53G 35.4MB/s in 46s \n", - "\n", - "2021-07-20 08:37:16 (34.1 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n", - "\n", - "CPU times: user 19.6 s, sys: 3.11 s, total: 22.7 s\n", - "Wall time: 35.2 s\n", - "done loading Word2Vec\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "m-WjFyC6b_IE", - "outputId": "87270b42-96b9-4420-f22a-6f13160e5cbe" + "outputId": "7cb1a092-d3fa-4bf4-e437-6d079da7ed74" }, - "source": [ - "#Inspect the model\n", - "word2vec_vocab = w2v_model.vocab.keys()\n", - "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", - "print(len(word2vec_vocab))" - ], - "execution_count": 6, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "3000000\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#Inspect the model\n", + "word2vec_vocab = w2v_model.key_to_index.keys()\n", + "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", + "print(len(word2vec_vocab))" ] }, { "cell_type": "code", + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XEz30Jztb_IP", - "outputId": "18794f4b-828f-4c7c-9708-b9af3143d700" + "outputId": "7c37e0e1-9f2e-411b-cdac-b89ecc39a0ea" }, - "source": [ - "#Inspect the dataset\n", - "print(len(cats), len(texts))\n", - "print(texts[1])\n", - "print(cats[1])" - ], - "execution_count": 7, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "3000 3000\n", "Good case, Excellent value.\n", "1\n", "\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#Inspect the dataset\n", + "print(len(cats), len(texts))\n", + "print(texts[1])\n", + 
"print(cats[1])" ] }, { "cell_type": "code", + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MFOGaDTwb_Ig", - "outputId": "b9983e21-f00e-4c3e-ebe4-e2c8be738398" + "outputId": "4e50a4a9-1f40-429c-c7b3-e445e42cae6f" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000 3000\n", + "['good', 'case', 'excellent', 'value']\n", + "1\n", + "\n" + ] + } + ], "source": [ "#preprocess the text.\n", "def preprocess_corpus(texts):\n", @@ -454,37 +433,34 @@ " #Nested function that lowercases, removes stopwords and digits from a list of tokens\n", " return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n", " and token not in punctuation]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. \n", + " #This return statement below uses the above function to process twitter tokenizer output further.\n", " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n", "\n", "texts_processed = preprocess_corpus(texts)\n", "print(len(cats), len(texts_processed))\n", "print(texts_processed[1])\n", "print(cats[1])" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "text": [ - "3000 3000\n", - "['good', 'case', 'excellent', 'value']\n", - "1\n", - "\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fXRiGtY1b_Iq", - "outputId": "fdba211b-e6bd-453e-b70d-79546d6ef005" + "outputId": "1f5eaad6-939e-46fc-cf27-21798f78e18f" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000\n" + ] + } + ], "source": [ "# Creating a feature vector by averaging all embeddings for all sentences\n", "def embedding_feats(list_of_lists):\n", @@ -493,13 +469,13 @@ " feats = []\n", " for tokens in list_of_lists:\n", " feat_for_this = np.zeros(DIMENSION)\n", - " 
count_for_this = 0 + 1e-5 # to avoid divide-by-zero \n", + " count_for_this = 0 + 1e-5 # to avoid divide-by-zero\n", " for token in tokens:\n", " if token in w2v_model:\n", " feat_for_this += w2v_model[token]\n", " count_for_this +=1\n", " if(count_for_this!=0):\n", - " feats.append(feat_for_this/count_for_this) \n", + " feats.append(feat_for_this/count_for_this)\n", " else:\n", " feats.append(zero_vector)\n", " return feats\n", @@ -507,56 +483,46 @@ "\n", "train_vectors = embedding_feats(texts_processed)\n", "print(len(train_vectors))" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "text": [ - "3000\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mr9IaQppb_Ix", - "outputId": "2c372ab4-38d8-4884-99dc-9bb3bbba16d0" + "outputId": "c74d84ea-6586-4d68-c8f7-e2e36b7f915d" }, - "source": [ - "#Take any classifier (LogisticRegression here, and train/test it like before.\n", - "classifier = LogisticRegression(random_state=1234)\n", - "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", - "classifier.fit(train_data, train_cats)\n", - "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", - "preds = classifier.predict(test_data)\n", - "print(classification_report(test_cats, preds))" - ], - "execution_count": 10, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "Accuracy: 0.8453333333333334\n", + "Accuracy: 0.8013333333333333\n", " precision recall f1-score support\n", "\n", " 0\n", - " 0.87 0.83 0.85 388\n", + " 0.77 0.83 0.80 353\n", " 1\n", - " 0.82 0.86 0.84 362\n", + " 0.84 0.78 0.81 397\n", "\n", - " accuracy 0.85 750\n", - " macro avg 0.85 0.85 0.85 750\n", - "weighted avg 0.85 0.85 0.85 750\n", + " accuracy 0.80 750\n", + " macro avg 0.80 0.80 0.80 750\n", + "weighted avg 0.80 0.80 0.80 750\n", "\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#Take any 
classifier (LogisticRegression here) and train/test it like before.\n", + "classifier = LogisticRegression(random_state=1234)\n", + "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", + "classifier.fit(train_data, train_cats)\n", + "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", + "preds = classifier.predict(test_data)\n", + "print(classification_report(test_cats, preds))" ] }, { @@ -565,8 +531,42 @@ "id": "k7wjLB8rb_JB" }, "source": [ - "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!" + "Not bad. With little effort we got about 80% accuracy. That's a great starting model to have!" ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "h4lF7mkPCAuy" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb index b04f9d4..219a8ae 100644 --- a/Ch4/04_FastText_Example.ipynb +++ b/Ch4/04_FastText_Example.ipynb @@ -1,571 +1,729 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "6FIToZHAhz2O" - }, - "source": [ - "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", - "**Note**: This notebook uses an older version of fasttext." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6FIToZHAhz2O" + }, + "source": [ + "In this notebook we will demonstrate using the fastText library to perform text classification on the DBpedia data, which can be downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning word embeddings and performing text classification, created by Facebook's AI Research (FAIR) lab. The library lets you train an unsupervised or a supervised model for obtaining vector representations of words. Facebook makes pretrained models available for 294 languages (source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", + "**Note**: This notebook uses an older version of fasttext." + ] }, - "id": "UBnT5t_LiCU2", - "outputId": "ca0bcea9-75a7-4237-e58e-154c3d72e89f" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting wget==3.2\n", - " Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=0e1e014b6bf086637aea4bfe15707b7d8d825e7280cd2f9c6ec1943ef00e80c7\n", - " Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting fasttext==0.9.2\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)\n", - "\u001b[K |████████████████████████████████| 71kB 6.8MB/s \n", - "\u001b[?25hRequirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (2.6.2)\n", - "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (57.0.0)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (1.19.5)\n", - "Building wheels for collected packages: fasttext\n", - " Building wheel for fasttext (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3091748 sha256=f30effec512519a72b11f0eaf7aa8a6b57df1643345f8e51bf7b1cb010552792\n", - " Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592\n", - "Successfully built fasttext\n", - "Installing collected packages: fasttext\n", - "Successfully installed fasttext-0.9.2\n" - ] - } - ], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install pandas==1.1.5\n", - "!pip install wget==3.2\n", - "!pip install fasttext==0.9.2\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "zrBi6bvbiCU4" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "YKgZXvTGb61z" - }, - "outputs": [], - "source": [ - "#necessary imports\n", - "import os\n", - "import pandas as pd\n", - "import wget\n", - "import tarfile" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UBnT5t_LiCU2", + "outputId": "c63a8ae7-5816-486c-b161-1a597cff909f" + }, + "outputs": [ + { + "output_type": "stream", + 
"name": "stdout", + "text": [ + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gdown==4.6.6 in /usr/local/lib/python3.10/dist-packages (4.6.6)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (3.12.2)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (2.31.0)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (1.16.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.66.1)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.11.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown==4.6.6) (2.4.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2023.7.22)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (1.7.1)\n", + "Requirement already satisfied: fasttext==0.9.2 in /usr/local/lib/python3.10/dist-packages (0.9.2)\n", + "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (2.11.1)\n", + "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (67.7.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (1.23.5)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install pandas==1.5.3\n", + "!pip install gdown==4.6.6\n", + "!pip install fasttext==0.9.2\n", + "\n", + "# ===========================" + ] }, - "id": "l6CfW7C3L4EB", - "outputId": "debf3639-77d2-4a2c-8aa1-3ff8438b9585" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-07-16 08:57:35-- https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "Resolving github.com (github.com)... 140.82.121.4\n", - "Connecting to github.com (github.com)|140.82.121.4|:443... connected.\n", - "HTTP request sent, awaiting response... 301 Moved Permanently\n", - "Location: https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz [following]\n", - "--2021-07-16 08:57:35-- https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "Reusing existing connection to github.com:443.\n", - "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz [following]\n", - "--2021-07-16 08:57:35-- https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 68431223 (65M) [application/octet-stream]\n", - "Saving to: ‘DATAPATH/dbpedia_csv.tar.gz’\n", - "\n", - "dbpedia_csv.tar.gz 100%[===================>] 65.26M 206MB/s in 0.3s \n", - "\n", - "2021-07-16 08:57:42 (206 MB/s) - ‘DATAPATH/dbpedia_csv.tar.gz’ saved [68431223/68431223]\n", - "\n", - "dbpedia_csv/\n", - "dbpedia_csv/test.csv\n", - "dbpedia_csv/classes.txt\n", - "dbpedia_csv/train.csv\n", - "dbpedia_csv/readme.txt\n", - "total 66M\n", - "drwxr-xr-x 3 root root 4.0K Jul 16 08:57 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:57 ..\n", - "drwxrwxr-x 2 1000 1000 4.0K Mar 29 2015 dbpedia_csv\n", - "-rw-r--r-- 1 root root 66M Jul 16 08:57 dbpedia_csv.tar.gz\n" - ] - } - ], - "source": [ - "try :\n", - " \n", - " from google.colab import files\n", - " \n", - " # downloading the data\n", - " !wget -P DATAPATH https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "\n", - " # untaring the required file\n", - " !tar -xvf DATAPATH/dbpedia_csv.tar.gz -C DATAPATH\n", - "\n", - " # sneek peek in the folder structure\n", - " !ls -lah DATAPATH\n", - " \n", - " # specifying the data_path\n", - " data_path = 'DATAPATH'\n", - " \n", - "except ModuleNotFoundError:\n", - " \n", - " if not os.path.exists(os.getcwd()+'\\\\Data\\\\dbpedia_csv') :\n", - " # downloading the data\n", - " url=\"https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\"\n", - " path=os.getcwd()+'\\Data'\n", - " 
wget.download(url,path)\n", - "\n", - " # untaring the required file\n", - " temp=path+'\\dbpedia_csv.tar.gz'\n", - " tar = tarfile.open(temp, \"r:gz\")\n", - " tar.extractall(path) \n", - " tar.close()\n", - " \n", - " # specifying the data_path\n", - " data_path='Data'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "zrBi6bvbiCU4" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] }, - "id": "lMoRw3oQb62I", - "outputId": "744d1cb7-4966-4db1-b176-c2020975ed94" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train:(560000, 3) Test:(70000, 3)\n" - ] - } - ], - "source": [ - "# Loading train data\n", - "train_file = data_path + '/dbpedia_csv/train.csv'\n", - "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", - "# Loading test data\n", - "test_file = data_path + '/dbpedia_csv/test.csv'\n", - "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", - "# Data we have\n", - "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "YKgZXvTGb61z" + }, + "outputs": [], + "source": [ + "#necessary imports\n", + "import os\n", + "import pandas as pd\n", + 
"import tarfile\n", + "import gdown" + ] }, - "id": "gaz226vXb62W", - "outputId": "a7e5ab41-732e-4a94-def6-5e62124d6bd5" - }, - "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", - "
" + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "l6CfW7C3L4EB", + "outputId": "6e30eb61-0cdc-4616-d14e-46888017cac9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\n", + "To: /content/DATAPATH/dbpedia_csv.tar.gz\n", + "100% 68.3M/68.3M [00:00<00:00, 164MB/s]\n", + "dbpedia_csv/\n", + "dbpedia_csv/classes.txt\n", + "dbpedia_csv/test.csv\n", + "dbpedia_csv/train.csv\n", + "dbpedia_csv/readme.txt\n", + "total 66M\n", + "drwxr-xr-x 3 root root 4.0K Aug 22 18:17 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 18:17 ..\n", + "drwxrwxr-x 2 3666 11555 4.0K Sep 9 2015 dbpedia_csv\n", + "-rw-r--r-- 1 root root 66M Aug 22 18:17 dbpedia_csv.tar.gz\n" + ] + } ], - "text/plain": [ - " class ... class_name\n", - "0 1 ... Company\n", - "1 1 ... Company\n", - "2 1 ... Company\n", - "3 1 ... Company\n", - "4 1 ... 
Company\n", - "\n", - "[5 rows x 4 columns]" + "source": [ + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_tar_file(file_path: str, extraction_path: str) -> None:\n", + " tar = tarfile.open(file_path, \"r:gz\")\n", + " tar.extractall(extraction_path)\n", + " tar.close()\n", + "\n", + "try :\n", + "\n", + " from google.colab import files\n", + "\n", + " # specifying the data_path\n", + " data_path = \"./DATAPATH\"\n", + "\n", + " !mkdir ./DATAPATH\n", + "\n", + " # downloading the data\n", + " !gdown -O ./DATAPATH/dbpedia_csv.tar.gz \"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\"\n", + "\n", + " # untaring the required file\n", + " !tar -xvf ./DATAPATH/dbpedia_csv.tar.gz --directory ./DATAPATH\n", + "\n", + " # sneek peek in the folder structure\n", + " !ls -lah ./DATAPATH\n", + "\n", + "except ModuleNotFoundError:\n", + " data_path = './Data/'\n", + " compressed_file_name = 'dbpedia_csv.tar.gz'\n", + " extracted_file_name = 'dbpedia_csv'\n", + "\n", + " # Check if Extracted File exists\n", + " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data'])\n", + "\n", + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + "\n", + " else:\n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data'])\n", + "\n", + " if location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_tar_file(os.path.join(location_of_compressed_file, compressed_file_name), data_path)\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " else:\n", + " # Download File\n", + " os.makedirs(\"./Data\", exist_ok=True)\n", + " output_path = './Data/'\n", + " 
gdown.download(\"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_tar_file(os.path.join(data_path, compressed_file_name), output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")" ] - }, - "execution_count": 6, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# Since we have no clue about the classes lets build one\n", - "# Mapping from class number to class name\n", - "class_dict={\n", - " 1:'Company',\n", - " 2:'EducationalInstitution',\n", - " 3:'Artist',\n", - " 4:'Athlete',\n", - " 5:'OfficeHolder',\n", - " 6:'MeanOfTransportation',\n", - " 7:'Building',\n", - " 8:'NaturalPlace',\n", - " 9:'Village',\n", - " 10:'Animal',\n", - " 11:'Plant',\n", - " 12:'Album',\n", - " 13:'Film',\n", - " 14:'WrittenWork'\n", - " }\n", - "\n", - "# Mapping the classes\n", - "df['class_name'] = df['class'].map(class_dict)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "si7VC_Rub62a", - "outputId": "a1f7d406-0e9c-4adf-eaee-fc09572f27bf" - }, - "outputs": [ { - "data": { - "text/plain": [ - "Athlete 40000\n", - "MeanOfTransportation 40000\n", - "Film 40000\n", - "Artist 40000\n", - "Building 40000\n", - "Company 40000\n", - "Plant 40000\n", - "Album 40000\n", - "NaturalPlace 40000\n", - "Village 40000\n", - "EducationalInstitution 40000\n", - "Animal 40000\n", - "WrittenWork 40000\n", - "OfficeHolder 40000\n", - "Name: class_name, dtype: int64" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lMoRw3oQb62I", + "outputId": "61ef0d52-044d-4829-db25-796f9ae2d562" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"Train:(560000, 3) Test:(70000, 3)\n" + ] + } + ], + "source": [ + "# Loading train data\n", + "train_file = data_path + '/dbpedia_csv/train.csv'\n", + "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", + "# Loading test data\n", + "test_file = data_path + '/dbpedia_csv/test.csv'\n", + "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", + "# Data we have\n", + "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))" ] - }, - "execution_count": 7, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"class_name\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "Sn-3kIqMb62d" - }, - "outputs": [], - "source": [ - "# Lets do some cleaning of this text\n", - "def clean_it(text,normalize=True):\n", - " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n", - " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n", - " replace(')',' ) ').replace('!',' ! ').replace('?',' ? 
').replace(':',' ').replace(';',' ').lower()\n", - " \n", - " # normalizing / encoding the text\n", - " if normalize:\n", - " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n", - " \n", - " return s\n", - "\n", - "# Now lets define a small function where we can use above cleaning on datasets\n", - "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n", - " # Defining the new data\n", - " df = data[['name','description']].copy(deep=True)\n", - " df['class'] = label_prefix + data['class'].astype(str) + ' '\n", - " \n", - " # cleaning it\n", - " if cleanit:\n", - " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n", - " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n", - " \n", - " # shuffling it\n", - " if shuffleit:\n", - " df.sample(frac=1).reset_index(drop=True)\n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "r_DRvdFcb62m", - "outputId": "d3fc1348-fcb2-4f50-c090-067e5ca66301" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4.38 s, sys: 193 ms, total: 4.57 s\n", - "Wall time: 4.63 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Transform the datasets using the above clean functions\n", - "df_train_cleaned = clean_df(df, True, True)\n", - "df_test_cleaned = clean_df(df_test, True, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "imMZ9-Bkb62t" - }, - "outputs": [], - "source": [ - "# Write files to disk as fastText classifier API reads files from disk.\n", - "train_file = data_path + '/dbpedia_train.csv'\n", - "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n", - "\n", - "test_file = data_path + '/dbpedia_test.csv'\n", - "df_test_cleaned.to_csv(test_file, header=None, index=False, 
columns=['class','name','description'] )\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bWZTSzd9b62x" - }, - "source": [ - "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "gaz226vXb62W", + "outputId": "ba8cecca-4c1b-41ae-f726-27f6bbef243f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " class name \\\n", + "0 1 E. D. Abbott Ltd \n", + "1 1 Schwan-Stabilo \n", + "2 1 Q-workshop \n", + "3 1 Marvell Software Solutions Israel \n", + "4 1 Bergan Mercy Medical Center \n", + "\n", + " description class_name \n", + "0 Abbott of Farnham E D Abbott Limited was a Br... Company \n", + "1 Schwan-STABILO is a German maker of pens for ... Company \n", + "2 Q-workshop is a Polish company located in Poz... Company \n", + "3 Marvell Software Solutions Israel known as RA... Company \n", + "4 Bergan Mercy Medical Center is a hospital loc... Company " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# Since we have no clue about the classes lets build one\n", + "# Mapping from class number to class name\n", + "class_dict={\n", + " 1:'Company',\n", + " 2:'EducationalInstitution',\n", + " 3:'Artist',\n", + " 4:'Athlete',\n", + " 5:'OfficeHolder',\n", + " 6:'MeanOfTransportation',\n", + " 7:'Building',\n", + " 8:'NaturalPlace',\n", + " 9:'Village',\n", + " 10:'Animal',\n", + " 11:'Plant',\n", + " 12:'Album',\n", + " 13:'Film',\n", + " 14:'WrittenWork'\n", + " }\n", + "\n", + "# Mapping the classes\n", + "df['class_name'] = df['class'].map(class_dict)\n", + "df.head()" + ] }, - "id": "a-H1wouCb62x", - "outputId": "3d7c130a-fd3b-472c-8585-2e965017763f" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1h 3min 10s, sys: 12.8 s, total: 1h 3min 23s\n", - "Wall time: 32min 17s\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "si7VC_Rub62a", + "outputId": "dd8c0c6e-fce9-4362-abab-45d9c751858c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Company 40000\n", + "EducationalInstitution 40000\n", + "Artist 40000\n", + "Athlete 40000\n", + "OfficeHolder 40000\n", + "MeanOfTransportation 40000\n", + "Building 40000\n", + "NaturalPlace 40000\n", + "Village 40000\n", + "Animal 40000\n", + "Plant 40000\n", + "Album 40000\n", + "Film 40000\n", + "WrittenWork 40000\n", + "Name: class_name, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "df[\"class_name\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Sn-3kIqMb62d" + }, + "outputs": [], + "source": [ + "# Lets do some cleaning of this text\n", + "def clean_it(text,normalize=True):\n", + " # Replacing possible issues with data. 
We can add or reduce the replacemtent in this chain\n", + " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n", + " replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()\n", + "\n", + " # normalizing / encoding the text\n", + " if normalize:\n", + " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n", + "\n", + " return s\n", + "\n", + "# Now lets define a small function where we can use above cleaning on datasets\n", + "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n", + " # Defining the new data\n", + " df = data[['name','description']].copy(deep=True)\n", + " df['class'] = label_prefix + data['class'].astype(str) + ' '\n", + "\n", + " # cleaning it\n", + " if cleanit:\n", + " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n", + " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n", + "\n", + " # shuffling it\n", + " if shuffleit:\n", + " df.sample(frac=1).reset_index(drop=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r_DRvdFcb62m", + "outputId": "75d06ea5-e04f-4c03-a6c4-37fdc4e10442" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 3.78 s, sys: 220 ms, total: 4 s\n", + "Wall time: 4.15 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Transform the datasets using the above clean functions\n", + "df_train_cleaned = clean_df(df, True, True)\n", + "df_test_cleaned = clean_df(df_test, True, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "imMZ9-Bkb62t" + }, + "outputs": [], + "source": [ + "# Write files to disk as fastText classifier API reads files from disk.\n", + "train_file = data_path + '/dbpedia_train.csv'\n", + 
"df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n", + "\n", + "test_file = data_path + '/dbpedia_test.csv'\n", + "df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bWZTSzd9b62x" + }, + "source": [ + "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "a-H1wouCb62x", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bdb18719-1b63-4c76-f1cb-58e93717fbd2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 1h, sys: 19 s, total: 1h 19s\n", + "Wall time: 33min 36s\n" + ] + } + ], + "source": [ + "%%time\n", + "## Using fastText for feature extraction and training\n", + "from fasttext import train_supervised\n", + "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", + "label_prefix refers to the prefix before label string in the dataset.\n", + "default is __label__. 
In our dataset, it is __class__.\n", + "There are several other parameters which can be seen in:\n", + "https://pypi.org/project/fasttext/\n", + "\"\"\"\n", + "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "sAyN3ZDbQFq-", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7aec83c4-251d-402e-e10d-1f926de4c64f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Samples: 70000 Precision@1 : 91.5214 Recall@1 : 91.5214\n", + "Test Samples: 70000 Precision@2 : 47.6493 Recall@2 : 95.2986\n", + "Test Samples: 70000 Precision@3 : 31.9848 Recall@3 : 95.9543\n", + "Test Samples: 70000 Precision@4 : 24.2014 Recall@4 : 96.8057\n", + "Test Samples: 70000 Precision@5 : 19.4149 Recall@5 : 97.0743\n" + ] + } + ], + "source": [ + "for k in range(1,6):\n", + " results = model.test(test_file,k=k)\n", + " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nrxSYRs3b621" + }, + "source": [ + "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 90% Precision and Recall are hard numbers to beat, too!" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "AHSqt1rLd-R0" + }, + "execution_count": null, + "outputs": [] } - ], - "source": [ - "%%time\n", - "## Using fastText for feature extraction and training\n", - "from fasttext import train_supervised \n", - "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", - "label_prefix refers to the prefix before label string in the dataset.\n", - "default is __label__. In our dataset, it is __class__. 
\n", - "There are several other parameters which can be seen in: \n", - "https://pypi.org/project/fasttext/\n", - "\"\"\"\n", - "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { + ], + "metadata": { + "accelerator": "GPU", "colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "sAyN3ZDbQFq-", - "outputId": "13acbc62-48d9-469c-dfb1-d3e5446b8530" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Samples: 70000 Precision@1 : 92.2486 Recall@1 : 92.2486\n", - "Test Samples: 70000 Precision@2 : 48.5014 Recall@2 : 97.0029\n", - "Test Samples: 70000 Precision@3 : 32.5619 Recall@3 : 97.6857\n", - "Test Samples: 70000 Precision@4 : 24.4968 Recall@4 : 97.9871\n", - "Test Samples: 70000 Precision@5 : 19.6420 Recall@5 : 98.2100\n" - ] + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "for k in range(1,6):\n", - " results = model.test(test_file,k=k)\n", - " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nrxSYRs3b621" - }, - "source": [ - "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!" 
- ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "04_FastText_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Ch4/05_DeepNN_Example.ipynb b/Ch4/05_DeepNN_Example.ipynb index a9c7abb..7cbdaee 100644 --- a/Ch4/05_DeepNN_Example.ipynb +++ b/Ch4/05_DeepNN_Example.ipynb @@ -1,31 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "05_DeepNN_Example.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, "cells": [ { "cell_type": "markdown", @@ -33,98 +6,35 @@ "id": "aLNg_Puse6EX" }, "source": [ - "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. " + "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset." 
] }, { "cell_type": "code", + "execution_count": 6, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eOJLveJqtEO3", - "outputId": "067a74b2-c5df-464d-a3fa-3f4517a9090a" + "id": "eOJLveJqtEO3" }, + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install wget==3.2\n", - "!pip install tensorflow==1.14.0\n", + "# !pip install numpy==1.23.5\n", + "# !pip install wget==3.2\n", + "# !pip install tensorflow==2.12.0\n", "\n", "# ===========================" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Collecting wget==3.2\n", - " Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=0590de33e3a5654cc81a0a21cf66fa3e8af32bf31e65c5a543d101b6d3fba858\n", - " Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting tensorflow==1.14.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/28/96efba1a516cdacc2e2d6d081f699c001d414cc8ca3250e6d59ae657eb2b/tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (109.3MB)\n", - "\u001b[K |████████████████████████████████| 109.3MB 104kB/s \n", - "\u001b[?25hRequirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.12.1)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.34.1)\n", - "Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)\n", - "\u001b[K |████████████████████████████████| 491kB 49.6MB/s \n", - "\u001b[?25hRequirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.0)\n", - "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.19.5)\n", - "Collecting tensorboard<1.15.0,>=1.14.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)\n", - "\u001b[K |████████████████████████████████| 3.2MB 33.6MB/s \n", - "\u001b[?25hRequirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow==1.14.0) (1.1.2)\n", - "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.8.1)\n", - "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.2.0)\n", - "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.12.0)\n", - "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.15.0)\n", - "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (3.17.3)\n", - "Collecting keras-applications>=1.0.6\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)\n", - "\u001b[K |████████████████████████████████| 51kB 8.4MB/s \n", - "\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.4.0)\n", - "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.36.2)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (57.0.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.3.4)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (1.0.1)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from keras-applications>=1.0.6->tensorflow==1.14.0) (3.1.0)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from 
markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (4.6.1)\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->keras-applications>=1.0.6->tensorflow==1.14.0) (1.5.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.7.4.3)\n", - "\u001b[31mERROR: kapre 0.3.5 has requirement tensorflow>=2.0.0, but you'll have tensorflow 1.14.0 which is incompatible.\u001b[0m\n", - "Installing collected packages: tensorflow-estimator, tensorboard, keras-applications, tensorflow\n", - " Found existing installation: tensorflow-estimator 2.5.0\n", - " Uninstalling tensorflow-estimator-2.5.0:\n", - " Successfully uninstalled tensorflow-estimator-2.5.0\n", - " Found existing installation: tensorboard 2.5.0\n", - " Uninstalling tensorboard-2.5.0:\n", - " Successfully uninstalled tensorboard-2.5.0\n", - " Found existing installation: tensorflow 2.5.0\n", - " Uninstalling tensorflow-2.5.0:\n", - " Successfully uninstalled tensorflow-2.5.0\n", - "Successfully installed keras-applications-1.0.8 tensorboard-1.14.0 tensorflow-1.14.0 tensorflow-estimator-1.14.0\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 7, "metadata": { "id": "Ixb_5zcYtEO5" }, + "outputs": [], "source": [ "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", "\n", @@ -137,15 +47,15 @@ "# !pip install -r \"ch4-requirements.txt\"\n", "\n", "# ===========================" - ], - "execution_count": 2, - "outputs": [] + ] }, { "cell_type": 
"code", + "execution_count": 8, "metadata": { "id": "xqUcb7NBb5--" }, + "outputs": [], "source": [ "#Make the necessary imports\n", "import os\n", @@ -154,7 +64,7 @@ "import tarfile\n", "import wget\n", "import warnings\n", - "warnings.filterwarnings(\"ignore\") \n", + "warnings.filterwarnings(\"ignore\")\n", "from zipfile import ZipFile\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", @@ -163,9 +73,7 @@ "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n", "from tensorflow.keras.models import Model, Sequential\n", "from tensorflow.keras.initializers import Constant" - ], - "execution_count": 3, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -178,84 +86,83 @@ }, { "cell_type": "code", + "execution_count": 9, "metadata": { - "id": "HUKTqLHud7fo", - "scrolled": false + "id": "HUKTqLHud7fo" }, + "outputs": [], "source": [ "%%capture\n", "try:\n", - " \n", + "\n", " from google.colab import files\n", - " \n", + "\n", " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n", " !unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B\n", - " \n", + "\n", " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n", - " \n", + "\n", " BASE_DIR = 'DATAPATH'\n", - " \n", + "\n", "except ModuleNotFoundError:\n", - " \n", + "\n", " if not os.path.exists('Data/glove.6B'):\n", " os.mkdir('Data/glove.6B')\n", - " \n", - " url='http://nlp.stanford.edu/data/glove.6B.zip' \n", - " wget.download(url,'Data') \n", - " \n", - " temp='Data/glove.6B.zip' \n", - " file = ZipFile(temp) \n", - " file.extractall('Data/glove.6B') \n", + "\n", + " url='http://nlp.stanford.edu/data/glove.6B.zip'\n", + " wget.download(url,'Data')\n", + "\n", + " temp='Data/glove.6B.zip'\n", + " file = ZipFile(temp)\n", + " file.extractall('Data/glove.6B')\n", " file.close()\n", - " \n", - " \n", - " \n", + "\n", + 
"\n", + "\n", " if not os.path.exists('Data/aclImdb'):\n", - " \n", - " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n", + "\n", + " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'\n", " wget.download(url,'Data')\n", - " \n", - " temp='Data/aclImdb_v1.tar.gz' \n", + "\n", + " temp='Data/aclImdb_v1.tar.gz'\n", " tar = tarfile.open(temp, \"r:gz\")\n", - " tar.extractall('Data') \n", + " tar.extractall('Data')\n", " tar.close()\n", - " \n", + "\n", " BASE_DIR = 'Data'" - ], - "execution_count": 4, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 10, "metadata": { "id": "qvl1qb78fUib" }, + "outputs": [], "source": [ "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n", "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')\n", "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')" - ], - "execution_count": 5, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 11, "metadata": { "id": "Yu9xmAZEd7fp" }, + "outputs": [], "source": [ - "#Within these, I only have a pos/ and a neg/ folder containing text files \n", + "#Within these, I only have a pos/ and a neg/ folder containing text files\n", "MAX_SEQUENCE_LENGTH = 1000\n", - "MAX_NUM_WORDS = 20000 \n", - "EMBEDDING_DIM = 100 \n", + "MAX_NUM_WORDS = 20000\n", + "EMBEDDING_DIM = 100\n", "VALIDATION_SPLIT = 0.2\n", "\n", "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n", "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py" - ], - "execution_count": 6, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -264,14 +171,16 @@ }, "source": [ "### Loading and Preprocessing\n", - " " + "" ] }, { "cell_type": "code", + "execution_count": 12, "metadata": { "id": "WI4O1usEb5_O" }, + "outputs": [], "source": [ "#Function to load the data from the dataset into the notebook. 
Will be called twice - for train and test.\n", "def get_data(data_dir):\n", @@ -292,56 +201,64 @@ "\n", "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n", "test_texts, test_labels = get_data(TEST_DATA_DIR)\n", - "labels_index = {'pos':1, 'neg':0} \n", + "labels_index = {'pos':1, 'neg':0}\n", "\n", - "#Just to see how the data looks like. \n", + "#Just to see how the data looks like.\n", "#print(train_texts[0])\n", "#print(train_labels[0])\n", "#print(test_texts[24999])\n", "#print(test_labels[24999])" - ], - "execution_count": 7, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QhhqM0Jdd7fs", - "outputId": "9b5b394e-bc52-4779-d85d-a0383446051d" + "outputId": "9e16478c-8111-4aaf-e73b-b89359dd114f" }, - "source": [ - "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer \n", - "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. \n", - "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) \n", - "tokenizer.fit_on_texts(train_texts) \n", - "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes \n", - "test_sequences = tokenizer.texts_to_sequences(test_texts) \n", - "word_index = tokenizer.word_index \n", - "print('Found %s unique tokens.' 
% len(word_index))" - ], - "execution_count": 8, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "Found 88582 unique tokens.\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer\n", + "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.\n", + "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)\n", + "tokenizer.fit_on_texts(train_texts)\n", + "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes\n", + "test_sequences = tokenizer.texts_to_sequences(test_texts)\n", + "word_index = tokenizer.word_index\n", + "print('Found %s unique tokens.' % len(word_index))" ] }, { "cell_type": "code", + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_e0V1-bBb5_d", - "outputId": "d866429d-5bb6-43a7-c66e-ed5abbafc4cd" + "outputId": "94d409aa-5ac2-4b4a-809d-fc91d4563285" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Splitting the train data into train and valid is done\n" + ] + } + ], "source": [ "#Converting this to sequences to be fed into neural network. Max seq. 
len is 1000 as set earlier\n", "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n", @@ -362,27 +279,29 @@ "y_val = trainvalid_labels[-num_validation_samples:]\n", "#This is the data we will use for CNN and RNN training\n", "print('Splitting the train data into train and valid is done')" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Splitting the train data into train and valid is done\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WUHqg2vvb5_l", - "outputId": "8387eda1-18f0-4254-9819-e63191b8fc04" + "outputId": "0b0bc141-184a-4d99-bf55-360d0e6a0212" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Preparing embedding matrix.\n", + "Found 400000 word vectors in Glove embeddings.\n", + "Preparing of embedding matrix is done\n" + ] + } + ], "source": [ "print('Preparing embedding matrix.')\n", "\n", @@ -418,18 +337,6 @@ " input_length=MAX_SEQUENCE_LENGTH,\n", " trainable=False)\n", "print(\"Preparing of embedding matrix is done\")" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Preparing embedding matrix.\n", - "Found 400000 word vectors in Glove embeddings.\n", - "Preparing of embedding matrix is done\n" - ], - "name": "stdout" - } ] }, { @@ -443,13 +350,26 @@ }, { "cell_type": "code", + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TTY-4K-Ob5_t", - "outputId": "836681ca-936e-400a-8973-0754759bb7cd" + "outputId": "834682a9-9371-4769-a967-cc2b2c24eaa7" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Define a 1D CNN model.\n", + "157/157 [==============================] - 139s 878ms/step - loss: 0.6694 - acc: 0.6158 - val_loss: 0.5212 - val_acc: 0.7606\n", + "782/782 [==============================] - 56s 71ms/step - loss: 0.5251 - acc: 
0.7537\n", + "Test accuracy with CNN: 0.7536799907684326\n" + ] + } + ], "source": [ "print('Define a 1D CNN model.')\n", "\n", @@ -467,30 +387,13 @@ "cnnmodel.compile(loss='categorical_crossentropy',\n", " optimizer='rmsprop',\n", " metrics=['acc'])\n", - "#Train the model. Tune to validation set. \n", + "#Train the model. Tune to validation set.\n", "cnnmodel.fit(x_train, y_train,\n", " batch_size=128,\n", " epochs=1, validation_data=(x_val, y_val))\n", "#Evaluate on test set:\n", "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", "print('Test accuracy with CNN:', acc)" - ], - "execution_count": 11, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Define a 1D CNN model.\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 156s 8ms/sample - loss: 0.6706 - acc: 0.5972 - val_loss: 0.5116 - val_acc: 0.7512\n", - "25000/25000 [==============================] - 67s 3ms/sample - loss: 0.5239 - acc: 0.7415\n", - "Test accuracy with CNN: 0.74152\n" - ], - "name": "stdout" - } ] }, { @@ -504,13 +407,26 @@ }, { "cell_type": "code", + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zI0bISwRb5_w", - "outputId": "d7697504-dacb-415c-b131-b89d6b10c771" + "outputId": "1c2285f4-edd4-4142-8f9e-7da2f9a91dc7" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", + "157/157 [==============================] - 216s 1s/step - loss: 0.6921 - acc: 
0.5152 - val_loss: 0.6671 - val_acc: 0.6168\n", + "782/782 [==============================] - 66s 85ms/step - loss: 0.6667 - acc: 0.6200\n", + "Test accuracy with CNN: 0.6200399994850159\n" + ] + } + ], "source": [ "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n", "cnnmodel = Sequential()\n", @@ -527,30 +443,13 @@ "cnnmodel.compile(loss='categorical_crossentropy',\n", " optimizer='rmsprop',\n", " metrics=['acc'])\n", - "#Train the model. Tune to validation set. \n", + "#Train the model. Tune to validation set.\n", "cnnmodel.fit(x_train, y_train,\n", " batch_size=128,\n", " epochs=1, validation_data=(x_val, y_val))\n", "#Evaluate on test set:\n", "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", "print('Test accuracy with CNN:', acc)" - ], - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 234s 12ms/sample - loss: 0.5323 - acc: 0.6927 - val_loss: 0.3179 - val_acc: 0.8644\n", - "25000/25000 [==============================] - 84s 3ms/sample - loss: 0.3409 - acc: 0.8495\n", - "Test accuracy with CNN: 0.84948\n" - ], - "name": "stdout" - } ] }, { @@ -559,18 +458,32 @@ "id": "6GwhXpmSgt4H" }, "source": [ - "### LSTM Model with training your own embedding " + "### LSTM Model with training your own embedding" ] }, { "cell_type": "code", + "execution_count": 18, 
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SvBt2Brib5_4", - "outputId": "008fe9fa-13bf-4127-ba46-67916426ddbe" + "outputId": "434183cf-f713-4911-b403-100223907162" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training an LSTM model, training embedding layer on the fly\n", + "Training the RNN\n", + "625/625 [==============================] - 1315s 2s/step - loss: 0.4609 - accuracy: 0.7807 - val_loss: 0.3932 - val_accuracy: 0.8286\n", + "782/782 [==============================] - 191s 245ms/step - loss: 0.4004 - accuracy: 0.8236\n", + "Test accuracy with RNN: 0.8235999941825867\n" + ] + } + ], "source": [ "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n", "\n", @@ -591,24 +504,6 @@ "score, acc = rnnmodel.evaluate(test_data, test_labels,\n", " batch_size=32)\n", "print('Test accuracy with RNN:', acc)" - ], - "execution_count": 13, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, training embedding layer on the fly\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", - "Training the RNN\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 1365s 68ms/sample - loss: 0.4997 - acc: 0.7506 - val_loss: 0.3839 - val_acc: 0.8403\n", - "25000/25000 [==============================] - 198s 8ms/sample - loss: 0.3962 - acc: 0.8300\n", - "Test accuracy with RNN: 0.82998\n" - ], - "name": "stdout" - } ] }, { @@ -622,13 +517,27 @@ }, { "cell_type": "code", + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Eymx0IyCb5_-", - 
"outputId": "da0fa303-a4c4-4b92-ff42-54f1a1d51e45" + "outputId": "2c6c182a-0dee-442c-f978-ac16e840b51f" }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training an LSTM model, using pre-trained embedding layer\n", + "Training the RNN\n", + "625/625 [==============================] - 1075s 2s/step - loss: 0.6050 - accuracy: 0.6728 - val_loss: 0.4578 - val_accuracy: 0.7916\n", + "782/782 [==============================] - 183s 234ms/step - loss: 0.4554 - accuracy: 0.7917\n", + "Test accuracy with RNN: 0.7916799783706665\n" + ] + } + ], "source": [ "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n", "\n", @@ -648,22 +557,39 @@ "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n", " batch_size=32)\n", "print('Test accuracy with RNN:', acc)" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, using pre-trained embedding layer\n", - "Training the RNN\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 1156s 58ms/sample - loss: 0.6122 - acc: 0.6602 - val_loss: 0.4538 - val_acc: 0.8017\n", - "25000/25000 [==============================] - 200s 8ms/sample - loss: 0.4666 - acc: 0.7930\n", - "Test accuracy with RNN: 0.793\n" - ], - "name": "stdout" - } ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tb81rafef3Wl" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git 
a/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb b/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb index 30eb9b3..a82ddd6 100644 --- a/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb +++ b/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb @@ -24,91 +24,21 @@ "id": "MK-POIlJE0Eu", "outputId": "490a8c7e-e8b3-4522-e448-37b50ef91109" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: tensorflow==1.14.0 in /usr/local/lib/python3.7/dist-packages (1.14.0)\n", - "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.12.0)\n", - "Requirement already satisfied: tensorflow-estimator<1.15.0rc0,>=1.14.0rc0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.14.0)\n", - "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.0)\n", - "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.12.1)\n", - "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.36.2)\n", - "Requirement already satisfied: tensorboard<1.15.0,>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.14.0)\n", - "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.15.0)\n", - "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.2.0)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.34.1)\n", - "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.19.5)\n", - "Requirement already satisfied: 
gast>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.4.0)\n", - "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.0.8)\n", - "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (3.17.3)\n", - "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.8.1)\n", - "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.2)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (57.2.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.3.4)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (1.0.1)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from keras-applications>=1.0.6->tensorflow==1.14.0) (3.1.0)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (4.6.1)\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->keras-applications>=1.0.6->tensorflow==1.14.0) (1.5.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages 
(from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.7.4.3)\n", - "Requirement already satisfied: torch==1.9.0 in /usr/local/lib/python3.7/dist-packages (1.9.0+cu102)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.9.0) (3.7.4.3)\n", - "Requirement already satisfied: scikit-learn==0.21.3 in /usr/local/lib/python3.7/dist-packages (0.21.3)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: pytorch_pretrained_bert==0.6.2 in /usr/local/lib/python3.7/dist-packages (0.6.2)\n", - "Requirement already satisfied: pytorch-nlp==0.5.0 in /usr/local/lib/python3.7/dist-packages (0.5.0)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (2019.12.20)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.19.5)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (2.23.0)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (4.41.1)\n", - "Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.9.0+cu102)\n", - "Requirement already satisfied: boto3 in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.18.1)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
/usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (2021.5.30)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (3.0.4)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=0.4.1->pytorch_pretrained_bert==0.6.2) (3.7.4.3)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (0.10.0)\n", - "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (0.5.0)\n", - "Requirement already satisfied: botocore<1.22.0,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (1.21.1)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.22.0,>=1.21.1->boto3->pytorch_pretrained_bert==0.6.2) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.22.0,>=1.21.1->boto3->pytorch_pretrained_bert==0.6.2) (1.15.0)\n", - "Requirement already satisfied: tqdm==4.41.1 in /usr/local/lib/python3.7/dist-packages (4.41.1)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages 
(from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Requirement already satisfied: matplotlib==3.2.2 in /usr/local/lib/python3.7/dist-packages (3.2.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.3.1)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.8.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.4.7)\n", - "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.19.5)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (0.10.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib==3.2.2) (1.15.0)\n", - "Requirement already satisfied: beautifulsoup4==4.6.3 in /usr/local/lib/python3.7/dist-packages (4.6.3)\n" - ] - } - ], + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install tensorflow==1.14.0\n", - "!pip install torch==1.9.0\n", - "!pip install scikit-learn==0.21.3\n", - "!pip install pytorch_pretrained_bert==0.6.2 pytorch-nlp==0.5.0 \n", - "!pip install tqdm==4.41.1\n", - "!pip install pandas==1.1.5\n", - "!pip install matplotlib==3.2.2\n", - "!pip install beautifulsoup4==4.6.3\n", + "# !pip install numpy==1.19.5\n", + "# !pip install tensorflow==1.14.0\n", + "# !pip install 
torch==1.9.0\n", + "# !pip install scikit-learn==0.21.3\n", + "# !pip install pytorch_pretrained_bert==0.6.2 pytorch-nlp==0.5.0 \n", + "# !pip install tqdm==4.41.1\n", + "# !pip install pandas==1.1.5\n", + "# !pip install matplotlib==3.2.2\n", + "# !pip install beautifulsoup4==4.6.3\n", "\n", "# ===========================" ] @@ -140,7 +70,17 @@ "metadata": { "id": "TtokjlkCQbiw" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-15 11:05:48.226287: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 11:05:48.661080: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 11:05:50.154202: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + } + ], "source": [ "#importing a few necessary packages and setting the DATA directory\n", "DATA_DIR=\".\"\n", @@ -151,8 +91,6 @@ "import pickle\n", "import tensorflow as tf\n", "\n", - "\n", - "\n", "# BERT imports\n", "import torch\n", "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", @@ -199,36 +137,7 @@ "id": "BI8AvyFZRAha", "outputId": "b254d1da-f187-4c77-f1e0-748a5e6a8e90" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. 
Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving IMDB Dataset.csv to IMDB Dataset.csv\n" - ] - } - ], + "outputs": [], "source": [ "# uploading and reading the dataset\n", "# source for dataset: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews\n", @@ -242,7 +151,7 @@ "except ModuleNotFoundError :\n", " \n", " # After downnloading the dataset, put the IMDB Dataset.csv file in Data folder.\n", - " df = pd.read_csv(\"Data/IMDB Dataset.csv\",engine='python', error_bad_lines=False)" + " df = pd.read_csv(\"Data/IMDB Dataset.csv\",engine='python', on_bad_lines='warn')" ] }, { @@ -322,9 +231,7 @@ ] }, "execution_count": 5, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -361,15 +268,14 @@ { "data": { "text/plain": [ + "sentiment\n", "1 25000\n", "0 25000\n", - "Name: sentiment, dtype: int64" + "Name: count, dtype: int64" ] }, "execution_count": 7, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -476,9 +382,7 @@ ] }, "execution_count": 9, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -526,17 +430,12 @@ "outputs": [ { "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, "text/plain": [ "\"[CLS] One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked They are right, as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO Trust me, this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs, sex or violence Its is hardcore, in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald 
Maximum Security State Penitentary It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda Em City is home to manyAryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and moreso scuffles, death stares, dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare Forget pretty pictures painted for mainstream audiences, forget charm, forget romanceOZ doesn't mess around The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence Not just violence, but injustice crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience Watching Oz, you may become comfortable with what is uncomfortable viewingthats if you can get in touch with your darker side [SEP]\"" ] }, "execution_count": 11, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -564,13 +463,6 @@ "outputId": "e80ff8c7-991d-45a4-9caf-600f9e694998" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 231508/231508 [00:00<00:00, 312015.27B/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -718,13 +610,6 @@ "outputId": "d36884cd-ea8b-4954-ad2d-303d065f0ea0" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 407873900/407873900 [00:34<00:00, 11838492.74B/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -740,260 +625,7 @@ " )\n", " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", - " (0): BertLayer(\n", - " (attention): 
BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (1): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (2): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " 
(value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (3): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (4): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, 
out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (5): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (6): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): 
Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (7): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (8): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): 
BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (9): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (10): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (11): BertLayer(\n", + " (0-11): 12 x BertLayer(\n", " (attention): BertAttention(\n", " (self): 
BertSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", @@ -1058,7 +690,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1073,97 +705,8 @@ "output_type": "stream", "text": [ "t_total value of -1 results in schedule not being applied\n", - "Epoch: 0%| | 0/4 [00:00" - ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" } ], "source": [ @@ -1257,6 +800,20 @@ "plt.plot(train_loss_set)\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1267,7 +824,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1281,9 +838,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb b/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb index ffed776..83eed16 100644 --- a/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb +++ b/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -19,170 +19,22 @@ "id": "TF5qfV_flTbr", "outputId": "b536d10d-767d-4a8d-9cd6-2ea607550b1b" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in 
/usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting ktrain==0.26.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/4c/88/10d29578f47d0d140bf669d5598e9f5a50465ddc423b32031c65e840d003/ktrain-0.26.3.tar.gz (25.3MB)\n", - "\u001b[K |████████████████████████████████| 25.3MB 1.6MB/s \n", - "\u001b[?25hCollecting scikit-learn==0.23.2\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)\n", - "\u001b[K |████████████████████████████████| 6.8MB 41.4MB/s \n", - "\u001b[?25hRequirement already satisfied: matplotlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (3.2.2)\n", - "Requirement already satisfied: pandas>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.1.5)\n", - "Requirement already satisfied: fastprogress>=0.1.21 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.0.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (2.23.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.0.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (21.0)\n", - "Requirement already satisfied: ipython in /usr/local/lib/python3.7/dist-packages (from 
ktrain==0.26.3) (5.5.0)\n", - "Collecting langdetect\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981kB)\n", - "\u001b[K |████████████████████████████████| 983kB 43.3MB/s \n", - "\u001b[?25hRequirement already satisfied: jieba in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (0.42.1)\n", - "Collecting cchardet\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)\n", - "\u001b[K |████████████████████████████████| 266kB 50.8MB/s \n", - "\u001b[?25hCollecting syntok\n", - " Downloading https://files.pythonhosted.org/packages/8c/76/a49e73a04b3e3a14ce232e8e28a1587f8108baa665644fe8c40e307e792e/syntok-1.3.1.tar.gz\n", - "Collecting seqeval==0.0.19\n", - " Downloading https://files.pythonhosted.org/packages/93/e5/b7705156a77f742cfe4fc6f22d0c71591edb2d243328dff2f8fc0f933ab6/seqeval-0.0.19.tar.gz\n", - "Collecting transformers<=4.3.3,>=4.0.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)\n", - "\u001b[K |████████████████████████████████| 1.9MB 37.5MB/s \n", - "\u001b[?25hCollecting sentencepiece\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)\n", - "\u001b[K |████████████████████████████████| 1.2MB 41.8MB/s \n", - "\u001b[?25hCollecting keras_bert>=0.86.0\n", - " Downloading https://files.pythonhosted.org/packages/6a/e4/3b2e2927c15c22f44005cb0ab0eaf2f7e623ea2b6488e4b7c5aca6c162c2/keras-bert-0.88.0.tar.gz\n", - "Requirement already satisfied: networkx>=2.3 in /usr/local/lib/python3.7/dist-packages 
(from ktrain==0.26.3) (2.5.1)\n", - "Collecting whoosh\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)\n", - "\u001b[K |████████████████████████████████| 471kB 34.8MB/s \n", - "\u001b[?25hRequirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.23.2->ktrain==0.26.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.23.2->ktrain==0.26.3) (1.19.5)\n", - "Collecting threadpoolctl>=2.0.0\n", - " Downloading https://files.pythonhosted.org/packages/c6/e8/c216b9b60cbba4642d3ca1bae7a53daa0c24426f662e0e3ce3dc7f6caeaa/threadpoolctl-2.2.0-py3-none-any.whl\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (0.10.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (1.3.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (2.4.7)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0.1->ktrain==0.26.3) (2018.9)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (2021.5.30)\n", - "Requirement already 
satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (1.24.3)\n", - "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (0.8.1)\n", - "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (5.0.5)\n", - "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (57.2.0)\n", - "Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (4.4.2)\n", - "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (4.8.0)\n", - "Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (1.0.18)\n", - "Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (0.7.5)\n", - "Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (2.6.1)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from langdetect->ktrain==0.26.3) (1.15.0)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from syntok->ktrain==0.26.3) (2019.12.20)\n", - "Requirement already satisfied: Keras>=2.2.4 in /usr/local/lib/python3.7/dist-packages (from seqeval==0.0.19->ktrain==0.26.3) (2.4.3)\n", - "Collecting sacremoses\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", - "\u001b[K |████████████████████████████████| 901kB 41.4MB/s \n", - "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", - "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)\n", - "\u001b[K |████████████████████████████████| 3.3MB 37.9MB/s \n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (4.41.1)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.0.12)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (4.6.1)\n", - "Collecting keras-transformer>=0.39.0\n", - " Downloading https://files.pythonhosted.org/packages/8a/35/6b079e920fe09a9349028bc2f209447e5636d90e29c5cf060bcc3177803a/keras-transformer-0.39.0.tar.gz\n", - "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/dist-packages (from traitlets>=4.2->ipython->ktrain==0.26.3) (0.2.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.7/dist-packages (from pexpect; sys_platform != \"win32\"->ipython->ktrain==0.26.3) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython->ktrain==0.26.3) (0.2.5)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (3.13)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (3.1.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (7.1.2)\n", - "Requirement already satisfied: zipp>=0.5 in 
/usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.7.4.3)\n", - "Collecting keras-pos-embd>=0.12.0\n", - " Downloading https://files.pythonhosted.org/packages/d8/d2/1cc072ea68b573f366e08936177a33e237e66fa7d5338289d4bee64696cf/keras-pos-embd-0.12.0.tar.gz\n", - "Collecting keras-multi-head>=0.28.0\n", - " Downloading https://files.pythonhosted.org/packages/a5/e6/a83f26b2e1582de237b125f595874d808e40698f31d44d5903e872d5b64d/keras-multi-head-0.28.0.tar.gz\n", - "Collecting keras-layer-normalization>=0.15.0\n", - " Downloading https://files.pythonhosted.org/packages/33/e1/0da586d544a0940a56a2f4aa704b7dbd95eaa8ceda6168b48f5ac95e6608/keras-layer-normalization-0.15.0.tar.gz\n", - "Collecting keras-position-wise-feed-forward>=0.7.0\n", - " Downloading https://files.pythonhosted.org/packages/58/02/cd3e7e51cf45d3825818384a2f7d9c340b60c9bf55a5682b7318e1c16eab/keras-position-wise-feed-forward-0.7.0.tar.gz\n", - "Collecting keras-embed-sim>=0.9.0\n", - " Downloading https://files.pythonhosted.org/packages/2d/48/78f6d134f1ede597d91186819c9e428ada51cd8d9ea28e5faf37ed2ee602/keras-embed-sim-0.9.0.tar.gz\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (1.5.2)\n", - "Collecting keras-self-attention>=0.50.0\n", - " Downloading https://files.pythonhosted.org/packages/ea/75/e6bc5b43ee968fef714f2f10a2a1674639ec85d2428cc47b2fe1f9af0115/keras-self-attention-0.50.0.tar.gz\n", - "Building wheels for collected packages: ktrain, langdetect, syntok, seqeval, keras-bert, keras-transformer, keras-pos-embd, keras-multi-head, 
keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-self-attention\n", - " Building wheel for ktrain (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for ktrain: filename=ktrain-0.26.3-cp37-none-any.whl size=25282390 sha256=0f129e50aaa4d78ab674e5f6b95d1c66df8f4fa6b62a1ac02b1867c70bbbdecd\n", - " Stored in directory: /root/.cache/pip/wheels/16/05/be/d6e659b3349016b1059e19fa028f165af4eeae2c196f329112\n", - " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for langdetect: filename=langdetect-1.0.9-cp37-none-any.whl size=993242 sha256=489499c000ae032ae91b31fdedc7ec2d0ef1fccbd3b997508abed818cd45e520\n", - " Stored in directory: /root/.cache/pip/wheels/7e/18/13/038c34057808931c7ddc6c92d3aa015cf1a498df5a70268996\n", - " Building wheel for syntok (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for syntok: filename=syntok-1.3.1-cp37-none-any.whl size=20919 sha256=d2ed41e31e9075584cdf09f7dcd35228826294d924d63d47ea163b49411be409\n", - " Stored in directory: /root/.cache/pip/wheels/51/c6/a4/be1920586c49469846bcd2888200bdecfe109ec421dab9be2d\n", - " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for seqeval: filename=seqeval-0.0.19-cp37-none-any.whl size=9932 sha256=0f0d22a626859918451e3439effefce4ee362409cc6a0afe0d953ebb60ab7e3b\n", - " Stored in directory: /root/.cache/pip/wheels/8d/1f/bf/1198beceed805a2099060975f6281d1b01046dd279e19c97be\n", - " Building wheel for keras-bert (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-bert: filename=keras_bert-0.88.0-cp37-none-any.whl size=34206 sha256=847f5cdc7a31d9961b28e9c07b757d07c882be271c96a892b97c68b0ce425518\n", - " Stored in directory: /root/.cache/pip/wheels/7f/d8/86/b4d91b941f6f3256c487b258d5e4268a3301203b717dd11f11\n", - " Building wheel for keras-transformer (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-transformer: filename=keras_transformer-0.39.0-cp37-none-any.whl size=12841 sha256=ddc1c3d23d4f739bf6269455f093aeac6995c0ec9a6d38ac3af3e5744d012e57\n", - " Stored in directory: /root/.cache/pip/wheels/77/42/35/d33c5907bca04ac5742e9eceefb644b680286de26728506a70\n", - " Building wheel for keras-pos-embd (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-pos-embd: filename=keras_pos_embd-0.12.0-cp37-none-any.whl size=7471 sha256=c3a4694a4c7002edf0f340cc03c67b10a1d2b07aa9a6d8cadd4c417d56be27dd\n", - " Stored in directory: /root/.cache/pip/wheels/36/d8/36/06ed09215806dca9ff504d8c0dda5da68d7f2c67d34a231d82\n", - " Building wheel for keras-multi-head (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-multi-head: filename=keras_multi_head-0.28.0-cp37-none-any.whl size=15559 sha256=8bdc034bc047b17f8a5b6c5f9e22e5a9d500f0d7c6068048d8549ce75d8a0237\n", - " Stored in directory: /root/.cache/pip/wheels/ec/92/bd/b3407bc29501f7e28eb970a6c425a9a375485c5d8197df6a8f\n", - " Building wheel for keras-layer-normalization (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.15.0-cp37-none-any.whl size=5224 sha256=1f6ae80fd5d9dfe471270c411aff14452b56bcad9abb0ada2e955ee9de5ad0b4\n", - " Stored in directory: /root/.cache/pip/wheels/de/ea/db/833c8a9b8326e703e9f8a78c0d4153294e6a1b1f97a1836397\n", - " Building wheel for keras-position-wise-feed-forward (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-position-wise-feed-forward: filename=keras_position_wise_feed_forward-0.7.0-cp37-none-any.whl size=5542 sha256=8794e806be2e654221719710e19e5219aa71df372998360a12e73281735ff2b8\n", - " Stored in directory: /root/.cache/pip/wheels/d2/d2/f6/58ce0aae0055dbccba8b40e62a6c22ab997105ad8c431a9e80\n", - " Building wheel for keras-embed-sim (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-embed-sim: filename=keras_embed_sim-0.9.0-cp37-none-any.whl size=4505 sha256=1167638915a16210f6e8f52ed29f112b1ff7abfb096bbb3c25f9b9c3d0ab52c6\n", - " Stored in directory: /root/.cache/pip/wheels/c1/d5/7d/bef5ee93c88bc6150294cc74cbb081647c505bf816918dd7ff\n", - " Building wheel for keras-self-attention (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-self-attention: filename=keras_self_attention-0.50.0-cp37-none-any.whl size=19416 sha256=40ca0b3fc31fcb65f163ad549b188cedd452b68d48d21e636b467155b7e43d73\n", - " Stored in directory: /root/.cache/pip/wheels/29/93/0d/891573db60f74d0e43bd7db1496c3ef898f8b5946a4c24cbda\n", - "Successfully built ktrain langdetect syntok seqeval keras-bert keras-transformer keras-pos-embd keras-multi-head keras-layer-normalization keras-position-wise-feed-forward keras-embed-sim keras-self-attention\n", - "Installing collected packages: threadpoolctl, scikit-learn, langdetect, cchardet, syntok, seqeval, sacremoses, tokenizers, transformers, sentencepiece, keras-pos-embd, keras-self-attention, keras-multi-head, keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-transformer, keras-bert, whoosh, ktrain\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed cchardet-2.1.7 keras-bert-0.88.0 keras-embed-sim-0.9.0 keras-layer-normalization-0.15.0 keras-multi-head-0.28.0 keras-pos-embd-0.12.0 keras-position-wise-feed-forward-0.7.0 keras-self-attention-0.50.0 keras-transformer-0.39.0 ktrain-0.26.3 langdetect-1.0.9 sacremoses-0.0.45 scikit-learn-0.23.2 sentencepiece-0.1.96 seqeval-0.0.19 syntok-1.3.1 threadpoolctl-2.2.0 tokenizers-0.10.3 transformers-4.3.3 whoosh-2.7.4\n" - ] - } - ], + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below 
and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install ktrain==0.26.3\n", + "# !pip install numpy==1.19.5\n", + "# !pip install pandas==1.1.5\n", + "# !pip install ktrain==0.26.3\n", "\n", "# ===========================" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "id": "_UN7tuqnlTbs" }, @@ -203,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -211,91 +63,28 @@ "id": "58WB13Jx3rQm", "outputId": "9af6cd3f-771e-4807-d041-bb8a3290bea1" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting tensorflow==2.4.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/94/0a/012cc33c643d844433d13001dd1db179e7020b05ddbbd0a9dc86c38a8efa/tensorflow-2.4.0-cp37-cp37m-manylinux2010_x86_64.whl (394.7MB)\n", - "\u001b[K |████████████████████████████████| 394.7MB 41kB/s \n", - "\u001b[?25hCollecting h5py~=2.10.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3f/c0/abde58b837e066bca19a3f7332d9d0493521d7dd6b48248451a9e3fe2214/h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9MB)\n", - "\u001b[K |████████████████████████████████| 2.9MB 45.8MB/s \n", - "\u001b[?25hRequirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.1.0)\n", - "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.12.0)\n", - "Requirement already satisfied: numpy~=1.19.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.19.5)\n", - "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.15.0)\n", - "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow==2.4.0) (1.12.1)\n", - "Collecting tensorflow-estimator<2.5.0,>=2.4.0rc0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/74/7e/622d9849abf3afb81e482ffc170758742e392ee129ce1540611199a59237/tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462kB)\n", - "\u001b[K |████████████████████████████████| 471kB 39.7MB/s \n", - "\u001b[?25hCollecting grpcio~=1.32.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/06/54/1c8be62beafe7fb1548d2968e518ca040556b46b0275399d4f3186c56d79/grpcio-1.32.0-cp37-cp37m-manylinux2014_x86_64.whl (3.8MB)\n", - "\u001b[K |████████████████████████████████| 3.8MB 37.5MB/s \n", - "\u001b[?25hRequirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.3.0)\n", - "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.7.4.3)\n", - "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.12)\n", - "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.17.3)\n", - "Requirement already satisfied: tensorboard~=2.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (2.5.0)\n", - "Collecting gast==0.3.3\n", - " Downloading https://files.pythonhosted.org/packages/d6/84/759f5dd23fec8ba71952d97bcc7e2c9d7d63bdc582421f3cd4be845f0c98/gast-0.3.3-py2.py3-none-any.whl\n", - "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.36.2)\n", - "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.6.3)\n", - "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.2.0)\n", - "Requirement already satisfied: keras-preprocessing~=1.1.2 in 
/usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.1.2)\n", - "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (0.4.4)\n", - "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.32.1)\n", - "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.8.0)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.0.1)\n", - "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (2.23.0)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (57.2.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (3.3.4)\n", - "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (0.6.1)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow==2.4.0) (1.3.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (0.2.8)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (4.2.2)\n", - "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3.6\" in /usr/local/lib/python3.7/dist-packages (from 
google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (4.7.2)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (2021.5.30)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (2.10)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard~=2.4->tensorflow==2.4.0) (4.6.1)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow==2.4.0) (3.1.1)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (0.4.8)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.4->tensorflow==2.4.0) (3.5.0)\n", - "Installing collected packages: h5py, tensorflow-estimator, grpcio, gast, tensorflow\n", - " Found existing installation: h5py 3.1.0\n", - " Uninstalling h5py-3.1.0:\n", - " Successfully uninstalled h5py-3.1.0\n", - " Found existing installation: tensorflow-estimator 2.5.0\n", - " Uninstalling tensorflow-estimator-2.5.0:\n", - " Successfully uninstalled tensorflow-estimator-2.5.0\n", - " Found existing installation: 
grpcio 1.34.1\n", - " Uninstalling grpcio-1.34.1:\n", - " Successfully uninstalled grpcio-1.34.1\n", - " Found existing installation: gast 0.4.0\n", - " Uninstalling gast-0.4.0:\n", - " Successfully uninstalled gast-0.4.0\n", - " Found existing installation: tensorflow 2.5.0\n", - " Uninstalling tensorflow-2.5.0:\n", - " Successfully uninstalled tensorflow-2.5.0\n", - "Successfully installed gast-0.3.3 grpcio-1.32.0 h5py-2.10.0 tensorflow-2.4.0 tensorflow-estimator-2.4.0\n" - ] - } - ], + "outputs": [], "source": [ "# use tensorflow 2.4.0 for this notebook\n", - "!pip install tensorflow==2.4.0" + "# !pip install tensorflow==2.4.0" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "id": "KN6N85ah8VXf" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "#Importing\n", "import ktrain\n", @@ -304,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -312,16 +101,7 @@ "id": "Mr1YXudk8Vti", "outputId": "4634f5ee-9c9d-4a32-9118-1845b6c43b7f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", - "84131840/84125825 [==============================] - 6s 0us/step\n" - ] - } - ], + "outputs": [], "source": [ "##obtain the dataset\n", "import os\n", @@ -335,7 +115,7 @@ " )\n", " IMDB_DATADIR = os.path.join(os.path.dirname(dataset), \"aclImdb\")\n", "except ModuleNotFoundError :\n", - " if not os.path.exists(os.getcwd()+\"\\\\Data\\\\aclImdb\") :\n", + " if not 
os.path.exists(os.getcwd()+\"/Data/aclImdb\") :\n", " import tensorflow as tf\n", " dataset = tf.keras.utils.get_file(\n", " fname=\"aclImdb.tar.gz\", \n", @@ -348,7 +128,7 @@ " else :\n", "\n", " # set path to dataset\n", - " IMDB_DATADIR=os.getcwd()+\"\\\\Data\\\\aclImdb\"" + " IMDB_DATADIR=os.getcwd()+\"/Data/aclImdb\"" ] }, { @@ -363,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -378,14 +158,6 @@ "output_type": "stream", "text": [ "detected encoding: utf-8\n", - "downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...\n", - "[██████████████████████████████████████████████████]\n", - "extracting pretrained BERT model...\n", - "done.\n", - "\n", - "cleanup downloaded zip...\n", - "done.\n", - "\n", "preprocessing train...\n", "language: en\n" ] @@ -393,15 +165,40 @@ { "data": { "text/html": [ - "done." + "\n", + "\n" ], "text/plain": [ "" ] }, - "metadata": { - "tags": [] + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "done." + ], + "text/plain": [ + "" + ] }, + "metadata": {}, "output_type": "display_data" }, { @@ -416,15 +213,40 @@ { "data": { "text/html": [ - "done." + "\n", + "\n" ], "text/plain": [ "" ] }, - "metadata": { - "tags": [] + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "done." + ], + "text/plain": [ + "" + ] }, + "metadata": {}, "output_type": "display_data" } ], @@ -448,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -462,7 +284,21 @@ "output_type": "stream", "text": [ "Is Multi-Label? 
False\n", - "maxlen is 500\n", + "maxlen is 500\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.9/site-packages/keras/src/initializers/initializers.py:120: UserWarning: The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "done.\n" ] } @@ -483,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -499,32 +335,20 @@ "\n", "\n", "begin training using onecycle policy with max lr of 2e-05...\n", - "Epoch 1/4\n", - "4167/4167 [==============================] - 2358s 561ms/step - loss: 0.3327 - accuracy: 0.8504 - val_loss: 0.1843 - val_accuracy: 0.9311\n", - "Epoch 2/4\n", - "4167/4167 [==============================] - 2325s 558ms/step - loss: 0.1542 - accuracy: 0.9423 - val_loss: 0.2223 - val_accuracy: 0.9138\n", - "Epoch 3/4\n", - "4167/4167 [==============================] - 2323s 557ms/step - loss: 0.0899 - accuracy: 0.9677 - val_loss: 0.1847 - val_accuracy: 0.9350\n", - "Epoch 4/4\n", - "4167/4167 [==============================] - 2322s 557ms/step - loss: 0.0247 - accuracy: 0.9934 - val_loss: 0.2330 - val_accuracy: 0.9416\n" + "Epoch 1/4\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" } ], "source": [ "learner.fit_onecycle(2e-5, 4)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -535,7 +359,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", 
"language": "python", "name": "python3" }, @@ -549,9 +373,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 }