Commit b91ef8f

Release koco-v0.1.1
1 parent c3c7adc commit b91ef8f

9 files changed (+265, -0 lines)

MANIFEST.in

+1
@@ -0,0 +1 @@
include requirements.txt

koco/__init__.py

+4
@@ -0,0 +1,4 @@
from koco.about import __version__, __description__  # noqa: F401
from koco.load import list_datasets, load_dataset  # noqa: F401

from koco import korean_hate_speech  # noqa: F401

koco/korean_hate_speech/__init__.py

+1
@@ -0,0 +1 @@
from .korean_hate_speech import load  # noqa: F401

koco/korean_hate_speech/korean_hate_speech.py

+131
@@ -0,0 +1,131 @@
"""Load kocohub/korean-hate-speech dataset

The dataset contains
- labeled train, dev set
- label-removed test set
- unlabeled data
- news title for each train, dev, test, and unlabeled corpus

For more information, see https://github.com/kocohub/korean-hate-speech
"""
import pandas as pd

from ..utils import DOWNLOAD_DIR, read_lines

dataset = 'korean-hate-speech'
datadir = f'{DOWNLOAD_DIR}/{dataset}-master'


def _load_labeled():
    """Load labeled train, dev set

    Returns:
        labeled_dataset (dict):
            {
                'train': [
                    {
                        'comments': str,
                        'contain_gender_bias': bool,
                        'bias': str,
                        'hate': str,
                        'news_title': str,
                    },
                    ...
                ],

                'dev': [
                    {
                        'comments': str,
                        'contain_gender_bias': bool,
                        'bias': str,
                        'hate': str,
                        'news_title': str,
                    },
                    ...
                ]
            }
    """
    train = pd.read_csv(f'{datadir}/labeled/train.tsv', sep='\t')
    dev = pd.read_csv(f'{datadir}/labeled/dev.tsv', sep='\t')
    train_news_title = read_lines(f'{datadir}/news_title/train.news_title.txt')
    dev_news_title = read_lines(f'{datadir}/news_title/dev.news_title.txt')
    assert train.shape[0] == len(train_news_title)
    assert dev.shape[0] == len(dev_news_title)

    train['news_title'] = train_news_title
    dev['news_title'] = dev_news_title

    labeled_dataset = dict()
    labeled_dataset['train'] = train.to_dict('records')
    labeled_dataset['dev'] = dev.to_dict('records')
    return labeled_dataset


def _load_unlabeled():
    """Load unlabeled corpus

    Returns:
        unlabeled_dataset (list of dict):
            [
                {
                    'comments': str,
                    'news_title': str,
                }, ...
            ]
    """
    unlabeled_comments = []
    unlabeled_news_titles = []
    for i in range(5):
        unlabeled_comments_tmp = read_lines(f'{datadir}/unlabeled/unlabeled_comments_{i}.txt')
        unlabeled_comments.extend(unlabeled_comments_tmp)
        unlabeled_news_title_tmp = read_lines(f'{datadir}/news_title/unlabeled_comments.news_title_{i}.txt')
        unlabeled_news_titles.extend(unlabeled_news_title_tmp)
    assert len(unlabeled_comments) == len(unlabeled_news_titles)

    # TODO: multi-processing
    unlabeled_dataset = []
    for c, nt in zip(unlabeled_comments, unlabeled_news_titles):
        d = {'comments': c, 'news_title': nt}
        unlabeled_dataset.append(d)
    return unlabeled_dataset


def _load_testset():
    """Load testset

    Note that testset doesn't contain any labels

    Returns:
        testset (list of dict):
            [
                {
                    'comments': str,
                    'news_title': str,
                }, ...
            ]
    """
    test = pd.read_csv(f'{datadir}/test.no_label.tsv', sep='\t')
    test_news_title = read_lines(f'{datadir}/news_title/test.news_title.txt')
    assert test.shape[0] == len(test_news_title)

    test['news_title'] = test_news_title
    return test.to_dict('records')


AVAILABLE_MODE = {
    'labeled': _load_labeled,
    'unlabeled': _load_unlabeled,
    'testset': _load_testset,
}


def load(mode):
    """Load korean-hate-speech dataset

    Args:
        mode (str): Either labeled, unlabeled, or testset
    """
    if mode not in AVAILABLE_MODE:
        raise ValueError(f'Invalid mode. Try one of {list(AVAILABLE_MODE)}')
    return AVAILABLE_MODE[mode]()
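
For orientation, here is a minimal usage sketch of this module-level load (not part of the commit). It assumes the dataset has already been downloaded to ~/.kocohub by the patch step defined in koco/load.py below:

    from koco.korean_hate_speech import load

    labeled = load('labeled')        # dict with 'train' and 'dev' lists
    record = labeled['train'][0]     # one record, shaped as in the docstring
    print(record['comments'], record['bias'], record['hate'])
    print(record['contain_gender_bias'])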

koco/load.py

+52
@@ -0,0 +1,52 @@
import logging
import requests

from .patch import download_dataset
from .utils import DOWNLOAD_DIR, exist_dataset
from .korean_hate_speech import load as khs_loader


KOCOHUB = 'https://api.github.com/orgs/kocohub/repos'

logger = logging.getLogger(__name__)


def list_datasets():
    """List datasets in kocohub
    """
    # Retry a few times rather than looping forever on API failures
    # (e.g., rate limiting)
    for _ in range(5):
        r = requests.get(KOCOHUB, params={'per_page': '500'})
        if r.ok:
            return [info['name'] for info in r.json()]
    r.raise_for_status()


def is_valid_dataset(dataset):
    return dataset in list_datasets()


def patch_dataset(dataset, verbose=True):
    """Download and unzip dataset from kocohub

    Args:
        dataset (str): dataset name (e.g., korean-hate-speech)
        verbose (bool): whether to show dataset installation path
    """
    if exist_dataset(dataset):
        if verbose:
            logger.info(f'{dataset} is already installed in {DOWNLOAD_DIR}.')
    else:
        if not is_valid_dataset(dataset):
            raise ValueError(f'{dataset} is not in {list_datasets()}')
        download_dataset(dataset, verbose=verbose)


def load_dataset(dataset, mode, verbose=True):
    """Download dataset if needed, then dispatch to its loader"""
    patch_dataset(dataset, verbose)

    if dataset == 'korean-hate-speech':
        return khs_loader(mode)
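
End to end, the public API reduces to two calls. A short sketch (the first load_dataset call triggers a real download from GitHub):

    import koco

    print(koco.list_datasets())  # names of repos in the kocohub org

    train_dev = koco.load_dataset('korean-hate-speech', mode='labeled')
    print(len(train_dev['train']), len(train_dev['dev']))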

koco/patch.py

+24
@@ -0,0 +1,24 @@
import logging
import zipfile

import wget

from .utils import DOWNLOAD_DIR, make_dirs

baseurl = 'https://codeload.github.com/kocohub/{}/zip/master'

logger = logging.getLogger(__name__)


def download_dataset(dataset, verbose=True):
    make_dirs(DOWNLOAD_DIR)
    url = baseurl.format(dataset)
    wget.download(url, f'{DOWNLOAD_DIR}/{dataset}.zip')
    unzip(f'{DOWNLOAD_DIR}/{dataset}.zip')
    if verbose:
        logger.info(f'Dataset {dataset} downloaded to {DOWNLOAD_DIR}.')


def unzip(zippath):
    with zipfile.ZipFile(zippath) as z:
        z.extractall(DOWNLOAD_DIR)
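
One detail worth noting: GitHub's codeload zip for the master branch unpacks into a top-level <dataset>-master folder, which is the path that exist_dataset in koco/utils.py and the loaders' datadir both expect. A small illustration (paths only, nothing downloaded):

    from koco.utils import DOWNLOAD_DIR

    dataset = 'korean-hate-speech'
    print(f'https://codeload.github.com/kocohub/{dataset}/zip/master')
    print(f'{DOWNLOAD_DIR}/{dataset}-master')  # where the zip contents land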

koco/utils.py

+20
@@ -0,0 +1,20 @@
import os

DOWNLOAD_DIR = f'{os.path.expanduser("~")}/.kocohub'


def exist_dataset(dataset):
    return os.path.exists(f'{DOWNLOAD_DIR}/{dataset}-master')


def exist_dir(dirpath):
    return os.path.exists(dirpath)


def make_dirs(dirpath):
    if not exist_dir(dirpath):
        os.makedirs(dirpath)


def read_lines(path):
    # Use a context manager so the file handle is closed promptly
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

requirements.txt

+3
@@ -0,0 +1,3 @@
pandas>=1.0.3
requests>=2.23.0
wget>=3.2

setup.py

+29
@@ -0,0 +1,29 @@
import os
from setuptools import find_packages, setup

REQ_FILE = 'requirements.txt'
VERSION = '0.1.1'


def get_requires():
    thisdir = os.path.dirname(__file__)
    reqpath = os.path.join(thisdir, REQ_FILE)
    with open(reqpath) as f:
        return [line.rstrip('\n') for line in f]


setup(
    name='koco',
    version=VERSION,
    description='A library to easily access kocohub datasets',
    author='Jihyung Moon',
    author_email='[email protected]',
    url='https://github.com/inmoonlight/koco',
    license='MIT',
    packages=find_packages(),
    install_requires=get_requires(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Science/Research",
    ],
    keywords='korean nlp datasets',
)
