In [1]:
import warnings
warnings.filterwarnings(action='ignore')

def kaggle_format(df):
    """Replace the class names in ``df['label']`` with integer ids.

    Mapping: ``'none' -> 0``, ``'offensive' -> 1``, ``'hate' -> 2``.
    Any other value (including missing values) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column. The column is replaced on this
        same object, so the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    label_ids = {'none': 0, 'offensive': 1, 'hate': 2}

    def _encode(value):
        # Only string class names are looked up; everything else passes through.
        if isinstance(value, str) and value in label_ids:
            return label_ids[value]
        return value

    # Assign the whole column in one step. The chained form
    # ``df['label'][mask] = v`` writes through an intermediate Series, which
    # triggers SettingWithCopyWarning and silently does nothing when pandas
    # copy-on-write is active.
    df['label'] = df['label'].map(_encode)
    return df

In [2]:
import pandas as pd
# Training data: 7,893 rows.
# Class counts for 'none' / 'offensive' / 'hate': 3486 / 2498 / 1909
train = pd.read_csv('total_20210121.csv')
train = train[['comments', 'hate']].rename(columns={'hate': 'label'})
# Encode the class names as 0/1/2, then store the codes as strings.
train = kaggle_format(train).astype({'label': 'str'})

# Dev data: 471 rows.
# Class counts for 'none' / 'offensive' / 'hate': 160 / 189 / 122
dev = pd.read_csv('./korean-hate-speech-master/labeled/dev.tsv', sep='\t')
dev = dev[['comments', 'hate']].rename(columns={'hate': 'label'})
dev = kaggle_format(dev).astype({'label': 'str'})

# Unlabeled test file; loaded here but not used in the cells shown below.
test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\t')

In [4]:
# Training texts and their (string-encoded) labels.
X = train['comments']
y = train['label']

# Note: the "test" names below hold the labeled dev split, not the
# unlabeled test file.
X_test = dev['comments']
y_test = dev['label']

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Character 1- to 3-gram counts, capped at 5000 features.
vect_cv = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=0.0,
    max_features=5000,
)

# Same character n-gram setup, but TF-IDF weighted with sublinear term frequency.
vect_tf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=0.0,
    max_features=5000,
    sublinear_tf=True,
)

# Fit each vectorizer on the full training text and keep the resulting matrices.
X_cv = vect_cv.fit_transform(X)
X_tf = vect_tf.fit_transform(X)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Two 10-fold splitters: plain (unshuffled) K-fold and a class-stratified variant.
kf = KFold(n_splits=10)
skfold = StratifiedKFold(n_splits=10)

# Multinomial logistic regression with class weights balanced by frequency.
lgs = LogisticRegression(
    multi_class='multinomial',
    class_weight='balanced',
    random_state=13,
)

In [13]:
# Macro-F1 per fold with the stratified splitter: TF-IDF features first, then raw counts.
# NOTE(review): X_tf / X_cv were fit on the full training text before splitting,
# so each fold's vocabulary (and IDF weights) has seen its validation rows;
# consider moving the vectorizer into a Pipeline.
tf_f1_sk, cv_f1_sk = (
    cross_val_score(lgs, features, y, cv=skfold, scoring='f1_macro')
    for features in (X_tf, X_cv)
)

In [14]:
# Macro-F1 per fold with the plain K-fold splitter: TF-IDF features first, then raw counts.
# NOTE(review): X_tf / X_cv were fit on the full training text before splitting,
# so each fold's vocabulary (and IDF weights) has seen its validation rows;
# consider moving the vectorizer into a Pipeline.
tf_f1, cv_f1 = (
    cross_val_score(lgs, features, y, cv=kf, scoring='f1_macro')
    for features in (X_tf, X_cv)
)

#### Results for both StratifiedKFold and KFold

In [24]:
# Per-fold macro-F1 for plain K-fold, followed by the mean over folds.
# The score arrays come from cross_val_score, so their own .mean() is used:
# `np` is not imported in any cell shown in this notebook, and np.mean would
# raise NameError on a fresh kernel.
print('KFold f1_macro :: TF : ', tf_f1, '\n\nKFold f1_macro :: CV : ', cv_f1)
print('\nKFold f1_macro :: TF : ', tf_f1.mean(), ', CV : ', cv_f1.mean())

KFold f1_macro :: TF :  [0.54409835 0.5751475  0.55495904 0.53784724 0.5603153  0.52843219
 0.56598094 0.53796904 0.55379309 0.56143291] 

KFold f1_macro :: CV :  [0.54380155 0.54360996 0.53902544 0.53090814 0.54776016 0.52127145
 0.55323764 0.54966987 0.53721223 0.52919632]

KFold f1_macro :: TF :  0.5519975598739248 , CV :  0.5395692760168908


In [25]:
# Per-fold macro-F1 for stratified K-fold, followed by the mean over folds.
# The score arrays come from cross_val_score, so their own .mean() is used:
# `np` is not imported in any cell shown in this notebook, and np.mean would
# raise NameError on a fresh kernel.
print('SKFold f1_macro :: TF : ', tf_f1_sk, '\n\nSKFold f1_macro :: CV : ', cv_f1_sk)
print('\nSKFold f1_macro :: TF : ', tf_f1_sk.mean(), ', CV : ', cv_f1_sk.mean())

SKFold f1_macro :: TF :  [0.54163117 0.56082713 0.5771245  0.53130162 0.55013692 0.53510416
 0.58301114 0.54874383 0.54916122 0.5544463 ] 

SKFold f1_macro :: CV :  [0.53854502 0.53605738 0.55034544 0.5346379  0.54060879 0.5322481
 0.56368439 0.55686064 0.53945839 0.51794944]

SKFold f1_macro :: TF :  0.5531487999460587 , CV :  0.5410395492065031


In [26]:
def get_f1_score(pre_data):
    """Train on ``pre_data`` with TF-IDF features and return the dev-set macro-F1.

    A fresh character 1- to 3-gram TF-IDF vectorizer is fit on ``pre_data``,
    a multinomial logistic regression is trained against the notebook-level
    labels ``y``, and the model is scored on the notebook-level
    ``X_test`` / ``y_test``.

    Parameters
    ----------
    pre_data : iterable of str
        Training texts. They are fit against the global ``y``, so they must
        match it row for row.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions for ``X_test`` against ``y_test``.

    Notes
    -----
    Prints the shape of the training feature matrix as a side effect.
    """
    # Imported here because the notebook only imports f1_score in a later
    # cell than this definition.
    from sklearn.metrics import f1_score

    # max_features must be an integer; the string '5000' is rejected by
    # TfidfVectorizer when it is fit.
    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=(1, 3), sublinear_tf=True,
               max_features=5000)
    X_tf = vec.fit_transform(pre_data)
    print(X_tf.shape)
    lgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, 
                         class_weight='balanced', 
                         max_iter=6000, random_state=10)
    lgs.fit(X_tf, y)
    # Transform the dev texts with the vocabulary learned from pre_data.
    X_test_tf = vec.transform(X_test)
    pred =  lgs.predict(X_test_tf)
    score = f1_score(y_test, pred, average='macro')
    return score

In [32]:
from sklearn.metrics import f1_score
# TF-IDF features: refit the shared classifier on all training rows, then score on dev.
X_test1 = vect_tf.transform(X_test)
pred_tf = lgs.fit(X_tf, y).predict(X_test1)
print('dev 파일 :: TF : ', f1_score(y_test, pred_tf, average='macro'))

# Count features: the same classifier object is refit, replacing the TF-IDF fit above.
X_test2 = vect_cv.transform(X_test)
pred_cv = lgs.fit(X_cv, y).predict(X_test2)
print('dev 파일 :: CV : ', f1_score(y_test, pred_cv, average='macro'))

dev 파일 :: TF :  0.5773105429455988
dev 파일 :: CV :  0.5594804815636172
