{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Korean hate-speech baseline: character n-grams + logistic regression\n",
    "\n",
    "Compares raw counts (`CountVectorizer`) with TF-IDF (`TfidfVectorizer`) over character 1- to 3-grams as features for a 3-class logistic regression (`none` / `offensive` / `hate`). Models are scored with macro-F1 using 10-fold `KFold` and `StratifiedKFold` on the training data, then on the dev split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score\n",
    "\n",
    "# NOTE(review): this mutes every warning for the whole session, including any\n",
    "# solver convergence warnings raised while fitting the models below. It is kept\n",
    "# so the notebook behaves as before; consider narrowing the filter.\n",
    "warnings.filterwarnings(action='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text label -> integer class id.\n",
    "LABEL_TO_ID = {'none': 0, 'offensive': 1, 'hate': 2}\n",
    "\n",
    "\n",
    "def kaggle_format(df):\n",
    "    \"\"\"Return a copy of `df` with the `label` column encoded as integer class ids.\n",
    "\n",
    "    'none' -> 0, 'offensive' -> 1, 'hate' -> 2. Any other value is left as it is.\n",
    "    The input frame is not modified.\n",
    "    \"\"\"\n",
    "    encoded = df.copy()\n",
    "    # One-step assignment on a copy. Chained indexing (df['label'][mask] = ...)\n",
    "    # is not guaranteed by pandas to write through to the frame.\n",
    "    encoded['label'] = encoded['label'].map(lambda value: LABEL_TO_ID.get(value, value))\n",
    "    return encoded\n",
    "\n",
    "\n",
    "def load_labeled(path, **read_csv_kwargs):\n",
    "    \"\"\"Read a labeled file and return `comments` plus a string-typed `label` column.\n",
    "\n",
    "    The source column `hate` is renamed to `label` and encoded with `kaggle_format`.\n",
    "    Extra keyword arguments are forwarded to `pd.read_csv`.\n",
    "    \"\"\"\n",
    "    raw = pd.read_csv(path, **read_csv_kwargs)\n",
    "    labeled = raw[['comments', 'hate']].rename(columns={'hate': 'label'})\n",
    "    return kaggle_format(labeled).astype({'label': 'str'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train split (author's note): 7893 rows; none / offensive / hate = 3486 / 2498 / 1909\n",
    "train = load_labeled('total_20210121.csv')\n",
    "\n",
    "# Dev split (author's note): 471 rows; none / offensive / hate = 160 / 189 / 122\n",
    "dev = load_labeled('./korean-hate-speech-master/labeled/dev.tsv', sep='\\t')\n",
    "\n",
    "# Unlabeled test split; loaded here but not used by the cells below.\n",
    "test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = train.comments, train.label\n",
    "X_test, y_test = dev.comments, dev.label"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Character n-gram features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect_cv = CountVectorizer(min_df=0.0, analyzer='char',\n",
    "                          ngram_range=(1, 3), max_features=5000)\n",
    "\n",
    "vect_tf = TfidfVectorizer(min_df=0.0, analyzer='char', sublinear_tf=True,\n",
    "                          ngram_range=(1, 3), max_features=5000)\n",
    "\n",
    "X_cv = vect_cv.fit_transform(X)\n",
    "X_tf = vect_tf.fit_transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kf = KFold(n_splits=10)\n",
    "skfold = StratifiedKFold(n_splits=10)\n",
    "\n",
    "lgs = LogisticRegression(multi_class='multinomial', class_weight='balanced', random_state=13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_f1_sk = cross_val_score(lgs, X_tf, y, scoring='f1_macro', cv=skfold)\n",
    "cv_f1_sk = cross_val_score(lgs, X_cv, y, scoring='f1_macro', cv=skfold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_f1 = cross_val_score(lgs, X_tf, y, scoring='f1_macro', cv=kf)\n",
    "cv_f1 = cross_val_score(lgs, X_cv, y, scoring='f1_macro', cv=kf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Results for both StratifiedKFold and KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KFold f1_macro :: TF : [0.54409835 0.5751475 0.55495904 0.53784724 0.5603153 0.52843219\n",
      " 0.56598094 0.53796904 0.55379309 0.56143291] \n",
      "\n",
      "KFold f1_macro :: CV : [0.54380155 0.54360996 0.53902544 0.53090814 0.54776016 0.52127145\n",
      " 0.55323764 0.54966987 0.53721223 0.52919632]\n",
      "\n",
      "KFold f1_macro :: TF : 0.5519975598739248 , CV : 0.5395692760168908\n"
     ]
    }
   ],
   "source": [
    "print('KFold f1_macro :: TF : ', tf_f1, '\\n\\nKFold f1_macro :: CV : ', cv_f1)\n",
    "print('\\nKFold f1_macro :: TF : ', np.mean(tf_f1), ', CV : ', np.mean(cv_f1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SKFold f1_macro :: TF : [0.54163117 0.56082713 0.5771245 0.53130162 0.55013692 0.53510416\n",
      " 0.58301114 0.54874383 0.54916122 0.5544463 ] \n",
      "\n",
      "SKFold f1_macro :: CV : [0.53854502 0.53605738 0.55034544 0.5346379 0.54060879 0.5322481\n",
      " 0.56368439 0.55686064 0.53945839 0.51794944]\n",
      "\n",
      "SKFold f1_macro :: TF : 0.5531487999460587 , CV : 0.5410395492065031\n"
     ]
    }
   ],
   "source": [
    "print('SKFold f1_macro :: TF : ', tf_f1_sk, '\\n\\nSKFold f1_macro :: CV : ', cv_f1_sk)\n",
    "print('\\nSKFold f1_macro :: TF : ', np.mean(tf_f1_sk), ', CV : ', np.mean(cv_f1_sk))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dev-set evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_f1_score(pre_data):\n",
    "    \"\"\"Fit TF-IDF + logistic regression on `pre_data` and return macro-F1 on the dev split.\n",
    "\n",
    "    `pre_data` is the training text and must have one entry per element of the\n",
    "    global `y`. The globals `X_test` and `y_test` are used for evaluation.\n",
    "    Prints the shape of the training feature matrix as a side effect.\n",
    "    \"\"\"\n",
    "    # max_features must be an int (or None); scikit-learn rejects a string here.\n",
    "    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=(1, 3), sublinear_tf=True,\n",
    "                          max_features=5000)\n",
    "    train_features = vec.fit_transform(pre_data)\n",
    "    print(train_features.shape)\n",
    "    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1,\n",
    "                             class_weight='balanced',\n",
    "                             max_iter=6000, random_state=10)\n",
    "    clf.fit(train_features, y)\n",
    "    # Reuse the vocabulary fitted on the training text for the dev text.\n",
    "    dev_features = vec.transform(X_test)\n",
    "    pred = clf.predict(dev_features)\n",
    "    return f1_score(y_test, pred, average='macro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dev 파일 :: TF : 0.5773105429455988\n",
      "dev 파일 :: CV : 0.5594804815636172\n"
     ]
    }
   ],
   "source": [
    "lgs.fit(X_tf, y)\n",
    "X_test1 = vect_tf.transform(X_test)\n",
    "pred_tf = lgs.predict(X_test1)\n",
    "print('dev 파일 :: TF : ', f1_score(y_test, pred_tf, average='macro'))\n",
    "\n",
    "lgs.fit(X_cv, y)\n",
    "X_test2 = vect_cv.transform(X_test)\n",
    "pred_cv = lgs.predict(X_test2)\n",
    "print('dev 파일 :: CV : ', f1_score(y_test, pred_cv, average='macro'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}