{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(action='ignore')\n",
    "\n",
    "def kaggle_format(df):\n",
    "    df['label'][df['label'] == 'none'] = 0\n",
    "    df['label'][df['label'] == 'offensive'] = 1\n",
    "    df['label'][df['label'] == 'hate'] = 2\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# train 데이터 / 7893/ \n",
    "# 'none'/ 'offensive'/ 'hate' : 3486/ 2498/ 1909\n",
    "train = pd.read_csv('total_20210121.csv')\n",
    "train = train[['comments', 'hate']]\n",
    "train.columns = ['comments', 'label']\n",
    "train = kaggle_format(train)\n",
    "train = train.astype({'label': 'str'})\n",
    "\n",
    "# dev 데이터 / 471/ \n",
    "# 'none'/ 'offensive'/ 'hate' : 160/ 189/ 122\n",
    "dev = pd.read_csv('./korean-hate-speech-master/labeled/dev.tsv', sep='\\t')\n",
    "dev = dev[['comments', 'hate']]\n",
    "dev.columns = ['comments', 'label']\n",
    "dev = kaggle_format(dev)\n",
    "dev = dev.astype({'label': 'str'})\n",
    "\n",
    "test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\\t')"
   ]
  },
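  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the label distribution: the counts quoted in the comments above come from the source files, and this cell simply recomputes them (assuming the loading cells above ran without error)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recount the labels in train and dev to confirm the figures noted above\n",
    "print(train['label'].value_counts())\n",
    "print(dev['label'].value_counts())"
   ]
  },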
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = train.comments, train.label\n",
    "X_test, y_test = dev.comments, dev.label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vect_cv = CountVectorizer(min_df = 0.0, analyzer = 'char', \n",
    "    ngram_range = (1,3), max_features=5000) \n",
    "\n",
    "vect_tf = TfidfVectorizer(min_df = 0.0, analyzer = 'char', sublinear_tf=True, \n",
    "    ngram_range = (1,3), max_features=5000)\n",
    "\n",
    "X_cv = vect_cv.fit_transform(X)\n",
    "X_tf = vect_tf.fit_transform(X)"
   ]
  },
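  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch inspecting the fitted vectorizers: both matrices should have one row per training comment and at most 5,000 columns (the `max_features` cap)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shapes of the count and TF-IDF matrices plus the size of each learned vocabulary\n",
    "print('count matrix :', X_cv.shape, '/ vocab size :', len(vect_cv.vocabulary_))\n",
    "print('tfidf matrix :', X_tf.shape, '/ vocab size :', len(vect_tf.vocabulary_))"
   ]
  },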
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "kf = KFold(n_splits=10)\n",
    "skfold = StratifiedKFold(n_splits=10)\n",
    "\n",
    "lgs = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_f1_sk = cross_val_score(lgs, X_tf, y, scoring='f1_macro', cv=skfold)\n",
    "cv_f1_sk = cross_val_score(lgs, X_cv, y, scoring='f1_macro', cv=skfold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_f1 = cross_val_score(lgs, X_tf, y, scoring='f1_macro', cv=kf)\n",
    "cv_f1 = cross_val_score(lgs, X_cv, y, scoring='f1_macro', cv=kf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### skfold kfold 둘다"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KFold f1_macro :: TF :  [0.54409835 0.5751475  0.55495904 0.53784724 0.5603153  0.52843219\n",
      " 0.56598094 0.53796904 0.55379309 0.56143291] \n",
      "\n",
      "KFold f1_macro :: CV :  [0.54380155 0.54360996 0.53902544 0.53090814 0.54776016 0.52127145\n",
      " 0.55323764 0.54966987 0.53721223 0.52919632]\n",
      "\n",
      "KFold f1_macro :: TF :  0.5519975598739248 , CV :  0.5395692760168908\n"
     ]
    }
   ],
   "source": [
    "print('KFold f1_macro :: TF : ', tf_f1, '\\n\\nKFold f1_macro :: CV : ', cv_f1)\n",
    "print('\\nKFold f1_macro :: TF : ', np.mean(tf_f1), ', CV : ', np.mean(cv_f1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SKFold f1_macro :: TF :  [0.54163117 0.56082713 0.5771245  0.53130162 0.55013692 0.53510416\n",
      " 0.58301114 0.54874383 0.54916122 0.5544463 ] \n",
      "\n",
      "SKFold f1_macro :: CV :  [0.53854502 0.53605738 0.55034544 0.5346379  0.54060879 0.5322481\n",
      " 0.56368439 0.55686064 0.53945839 0.51794944]\n",
      "\n",
      "SKFold f1_macro :: TF :  0.5531487999460587 , CV :  0.5410395492065031\n"
     ]
    }
   ],
   "source": [
    "print('SKFold f1_macro :: TF : ', tf_f1_sk, '\\n\\nSKFold f1_macro :: CV : ', cv_f1_sk)\n",
    "print('\\nSKFold f1_macro :: TF : ', np.mean(tf_f1_sk), ', CV : ', np.mean(cv_f1_sk))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_f1_score(pre_data):\n",
    "    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=(1, 3), sublinear_tf=True,\n",
    "               max_features='5000')\n",
    "    X_tf = vec.fit_transform(pre_data)\n",
    "    print(X_tf.shape)\n",
    "    lgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, \n",
    "                         class_weight='balanced', \n",
    "                         max_iter=6000, random_state=10)\n",
    "    lgs.fit(X_tf, y)\n",
    "    X_test_tf = vec.transform(X_test)\n",
    "    pred =  lgs.predict(X_test_tf)\n",
    "    score = f1_score(y_test, pred, average='macro')\n",
    "    return score"
   ]
  },
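  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`get_f1_score` is defined above but not called elsewhere in this notebook. The sketch below shows how it could be used, assuming `X` holds the (optionally preprocessed) training comments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example call: vectorize X, fit the logistic regression, and score on the dev set\n",
    "score = get_f1_score(X)\n",
    "print('dev file :: TF (get_f1_score) : ', score)"
   ]
  },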
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dev 파일 :: TF :  0.5773105429455988\n",
      "dev 파일 :: CV :  0.5594804815636172\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "lgs.fit(X_tf, y)\n",
    "X_test1 = vect_tf.transform(X_test)\n",
    "pred_tf = lgs.predict(X_test1)\n",
    "print('dev 파일 :: TF : ', f1_score(y_test, pred_tf, average='macro'))\n",
    "\n",
    "lgs.fit(X_cv, y)\n",
    "X_test2 = vect_cv.transform(X_test)\n",
    "pred_cv = lgs.predict(X_test2)\n",
    "print('dev 파일 :: CV : ', f1_score(y_test, pred_cv, average='macro'))"
   ]
  },
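  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The unlabeled test file loaded at the top is not used above. Below is a minimal sketch for predicting on it with the TF-IDF model, assuming the TSV has a `comments` column like the dev file; the `submission.csv` name and column layout are illustrative, not a confirmed submission format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Refit the TF-IDF logistic regression on the full training data,\n",
    "# then predict labels (0/1/2, as strings) for the unlabeled test comments.\n",
    "lgs.fit(X_tf, y)\n",
    "X_unlabeled = vect_tf.transform(test['comments'])\n",
    "test_pred = lgs.predict(X_unlabeled)\n",
    "\n",
    "submission = pd.DataFrame({'comments': test['comments'], 'label': test_pred})\n",
    "submission.to_csv('submission.csv', index=False)"
   ]
  }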
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}