-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_query.py
61 lines (47 loc) · 1.81 KB
/
fix_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# encoding: utf-8
import sys, re, io
from nltk.tokenize import sent_tokenize as st
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
# Build the query files: for every document whose id is listed in
# dataset/wiki_en.queries.ids, take the first sentence of its text, remove
# the title's own words from it, and write "<id>\t<title>\t<query>" to either
# dataset/wiki_en.queries or dataset/wiki_en.queries.bad.

# The data file stores line breaks inside a document as the two literal
# characters backslash + 'n', not as a real newline.
_ESCAPED_NEWLINE = '\\n'


def _load_query_ids(path):
    """Return the set of ids listed one per line in the file at *path*.

    Ids are parsed with ``float()`` exactly as before, so files written as
    ``123`` or ``123.0`` both load; an integer document id compares and
    hashes equal to the matching float, so ``did in qids`` works for ints.
    Blank lines are skipped, and a final line without a trailing newline is
    read in full instead of losing its last character.
    """
    ids = set()
    with open(path, 'r') as id_file:
        for line in id_file:
            line = line.strip()
            if line:
                ids.add(float(line))
    return ids


def _first_sentence(doc):
    """Return the first sentence of *doc*, or '' when none is found.

    When *doc* contains an escaped newline, everything up to and including
    the first one is dropped before sentence splitting (presumably a
    repeated title/header line -- confirm against the extractor output).
    The detected sentence is then cut at its first escaped newline, if any.
    """
    _, found, remainder = doc.partition(_ESCAPED_NEWLINE)
    if found:
        doc = remainder
    sentences = st(doc)
    if not sentences:
        # Empty or whitespace-only text: the tokenizer returns no sentences.
        return ''
    return sentences[0].split(_ESCAPED_NEWLINE)[0]


def _strip_title_terms(title, sentence, stemmer, stop_words):
    """Lower-case *sentence* and remove the words of *title* from it.

    A multi-word title is first removed as an exact lower-cased phrase.
    Then every remaining whitespace-separated token whose stem equals the
    stem of a non-stopword title word is dropped.  Title words are compared
    to the stopword list in lower case, so a capitalised stopword in the
    title (e.g. a leading "The") no longer causes that stopword to be
    deleted from the whole sentence.
    """
    sentence = sentence.lower()
    title_words = title.split()
    if len(title_words) > 1:
        sentence = sentence.replace(title.lower(), '')
    title_stems = set(
        stemmer.stem(word.lower())
        for word in title_words
        if word.lower() not in stop_words
    )
    return ' '.join(
        token for token in sentence.split()
        if stemmer.stem(token) not in title_stems
    )


# NOTE: this rebinds the imported ``stopwords`` corpus reader to a plain set
# of English stopwords; the module-level name is kept as it was.
stopwords = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
qids = _load_query_ids('dataset/wiki_en.queries.ids')
wiki_en_data_path = '/export/a14/shuo/wikiclir17/wiki/extracted_wiki/wiki_en.dat'

# All three files are managed by one ``with`` so the outputs are flushed and
# closed even if processing stops part-way through.
with io.open('dataset/wiki_en.queries', 'w', encoding='utf-8') as fgood, \
        io.open('dataset/wiki_en.queries.bad', 'w', encoding='utf-8') as fbad, \
        io.open(wiki_en_data_path, 'r', encoding='utf-8', errors='ignore') as f:
    for l in f:
        fields = l.split('\t')
        try:
            did = int(fields[0])
        except ValueError:
            # First column is not an integer id: not a document row.
            continue
        if len(fields) < 2:
            # An id with no title/text column cannot produce a query.
            continue
        title = fields[1]
        doc = fields[-1]
        sys.stdout.write('\rHandling doc id: ' + str(did))
        sys.stdout.flush()  # important: '\r' progress line has no newline
        if did not in qids:
            continue
        doc = _strip_title_terms(title, _first_sentence(doc), stemmer, stopwords)
        output = '\t'.join([str(did), title, doc]) + '\n'
        # Disambiguation-style pages do not make usable queries; keep them
        # in a separate file.
        if 'can refer to' in doc or 'may refer to' in doc or 'disambiguation' in title:
            fbad.write(output)
        else:
            fgood.write(output)