-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_rel.py
34 lines (28 loc) · 1.21 KB
/
fix_rel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os, io, sys
from tqdm import tqdm
# Remove ill-formed articles from the corpora
# Filter every relevance file in `data_dir`: for each `<source>2<target>.rel`
# file, keep only the rows whose query id is present in
# dataset/wiki_<source>.queries and whose document id is present in
# dataset/wiki_<target>.documents, and write the surviving rows to
# dataset/<same file name>.
data_dir = 'dataset_original'


def _load_ids(path):
    """Return the set of integer ids in the first column of *path*.

    Each line must hold at least three tab-separated fields (id, title,
    text); a line with fewer fields raises ValueError, as does a
    non-integer id. Only the id is kept because the rest of the script
    tests membership and never reads the title or text.
    """
    ids = set()
    # Binary mode: the text columns are never used, so they are not decoded,
    # and lines are terminated by b'\n' only on both Python 2 and Python 3.
    with open(path, 'rb') as handle:
        for line in handle:
            # maxsplit=2 keeps any tabs inside the text in the third field.
            record_id, _title, _text = line.split(b'\t', 2)
            ids.add(int(record_id))
    return ids


for f in os.listdir(data_dir):
    if f.split('.')[-1] != 'rel':
        continue
    # Single-argument call form prints the same text on Python 2 and 3.
    print("Processing: %s" % f)
    # The file stem encodes the pair as "<source>2<target>".
    source, target = f.split('.')[0].split('2')
    queries = _load_ids(os.path.join('dataset', 'wiki_%s.queries' % source))
    documents = _load_ids(
        os.path.join('dataset', 'wiki_%s.documents' % target))
    # Both handles are managed by `with`, so the output file is flushed and
    # closed even if a malformed row raises part-way through.
    with open(os.path.join(data_dir, f)) as relf, \
            open(os.path.join('dataset', f), 'w') as outf:
        # readlines() gives tqdm a length, so the bar can show a total.
        for row in tqdm(relf.readlines()):
            # Strip only a real line terminator; slicing off the last
            # character would truncate `rel` on an unterminated final line.
            src, _rand, doc, rel = row.rstrip('\r\n').split('\t')
            src = int(src)
            doc = int(doc)
            if src in queries and doc in documents:
                # The second input column is not written to the output.
                outf.write('\t'.join([str(src), str(doc), rel]) + '\n')