|
| 1 | +"""Load kocohub/korean-hate-speech dataset |
| 2 | +
|
| 3 | +The dataset contains |
| 4 | + - labeled train, dev set |
| 5 | + - label-removed test set |
| 6 | + - unlabeled data |
| 7 | + - news title for each train, dev, test, and unlabeled corpus |
| 8 | +
|
| 9 | +For more information, see https://github.com/kocohub/korean-hate-speech |
| 10 | +""" |
| 11 | + |
| 12 | +import pandas as pd |
| 13 | + |
| 14 | +from ..utils import DOWNLOAD_DIR, read_lines |
| 15 | + |
# Dataset name; also the stem of the directory the files are read from.
dataset = 'korean-hate-speech'
# Root directory of the dataset files under DOWNLOAD_DIR.
# NOTE(review): the '-master' suffix presumably matches the extracted
# archive of the repository's master branch -- confirm against the downloader.
datadir = f'{DOWNLOAD_DIR}/{dataset}-master'
| 18 | + |
| 19 | + |
def _load_labeled():
    """Load the labeled train and dev splits together with their news titles.

    For each split, the table ``{datadir}/labeled/<split>.tsv`` is read and
    the entries of ``{datadir}/news_title/<split>.news_title.txt`` are
    attached as a ``news_title`` column, one per row.

    Returns:
        labeled_dataset (dict):
            {
                'train': [
                    {
                        'comments': str,
                        'contain_gender_bias': bool,
                        'bias': str,
                        'hate': str,
                        'news_title': str,
                    },
                    ...
                ],
                'dev': [
                    {
                        'comments': str,
                        'contain_gender_bias': bool,
                        'bias': str,
                        'hate': str,
                        'news_title': str,
                    },
                    ...
                ],
            }

    Raises:
        ValueError: If a split has a different number of rows than news titles.
    """
    labeled_dataset = {}
    for split in ('train', 'dev'):
        frame = pd.read_csv(f'{datadir}/labeled/{split}.tsv', sep='\t')
        titles = read_lines(f'{datadir}/news_title/{split}.news_title.txt')
        # Explicit check rather than `assert`: assertions are stripped when
        # Python runs with -O, and this guards externally downloaded data.
        if frame.shape[0] != len(titles):
            raise ValueError(
                f'{split}: {frame.shape[0]} labeled rows but '
                f'{len(titles)} news titles'
            )
        frame['news_title'] = titles
        labeled_dataset[split] = frame.to_dict('records')
    return labeled_dataset
| 63 | + |
| 64 | + |
def _load_unlabeled():
    """Load the unlabeled corpus together with its news titles.

    The corpus is read from five numbered shards (0-4); the comment shards
    and the news-title shards are concatenated in shard order and then
    paired positionally.

    Returns:
        unlabeled_dataset (list of dict):
            [
                {
                    'comments': str,
                    'news_title': str,
                }, ...
            ]

    Raises:
        ValueError: If the total number of comments differs from the total
            number of news titles.
    """
    comments = []
    news_titles = []
    for shard in range(5):
        comments.extend(
            read_lines(f'{datadir}/unlabeled/unlabeled_comments_{shard}.txt')
        )
        news_titles.extend(
            read_lines(f'{datadir}/news_title/unlabeled_comments.news_title_{shard}.txt')
        )

    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, and zip() below would silently truncate to the
    # shorter list on a mismatch.
    if len(comments) != len(news_titles):
        raise ValueError(
            f'unlabeled: {len(comments)} comments but '
            f'{len(news_titles)} news titles'
        )

    # TODO: multi-processing
    return [
        {'comments': comment, 'news_title': title}
        for comment, title in zip(comments, news_titles)
    ]
| 92 | + |
| 93 | + |
def _load_testset():
    """Load the test set together with its news titles.

    Note that the test set doesn't contain any labels.

    Returns:
        testset (list of dict):
            [
                {
                    'comments': str,
                    'news_title': str,
                }, ...
            ]

    Raises:
        ValueError: If the number of test rows differs from the number of
            news titles.
    """
    test = pd.read_csv(f'{datadir}/test.no_label.tsv', sep='\t')
    titles = read_lines(f'{datadir}/news_title/test.news_title.txt')
    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, and this guards externally downloaded data.
    if test.shape[0] != len(titles):
        raise ValueError(
            f'test: {test.shape[0]} rows but {len(titles)} news titles'
        )

    test['news_title'] = titles
    return test.to_dict('records')
| 114 | + |
| 115 | + |
# Dispatch table used by load(): mode name -> zero-argument loader function.
AVAILABLE_MODE = {
    'labeled': _load_labeled,
    'unlabeled': _load_unlabeled,
    'testset': _load_testset,
}
| 121 | + |
| 122 | + |
def load(mode):
    """Load korean-hate-speech dataset

    Args:
        mode (str): One of the keys of ``AVAILABLE_MODE``: 'labeled',
            'unlabeled', or 'testset'.

    Returns:
        Whatever the loader registered for ``mode`` returns: a dict of
        splits for 'labeled', a list of dicts for 'unlabeled' and 'testset'.

    Raises:
        ValueError: If ``mode`` is not a key of ``AVAILABLE_MODE``.
    """
    if mode not in AVAILABLE_MODE:
        # list(...) renders as ['labeled', ...] instead of dict_keys([...]).
        raise ValueError(
            f'Invalid mode {mode!r}. Try one of {list(AVAILABLE_MODE)}'
        )
    return AVAILABLE_MODE[mode]()
0 commit comments