-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsavehtml.py
59 lines (51 loc) · 1.77 KB
/
savehtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import warc
from assessment.models import *
import mongoengine
import pymongo
#mongoengine.connect(db=ntcir)
# Connect to the MongoDB server on localhost:27017.  ``pymongo.Connection``
# was removed in PyMongo 3.0; ``MongoClient`` (available since PyMongo 2.4)
# is its replacement and accepts the same host/port arguments.
con = pymongo.MongoClient('localhost', 27017)
ntcir = con.ntcir

# ``topic`` is the "ntcir" collection and ``table`` the "table" collection of
# the "ntcir" database; the rest of the script inserts into ``topic``.
topic = ntcir.ntcir
table = ntcir.table

# Seed the lookup collection with the three id -> name rows.
# NOTE(review): ``Collection.insert`` is deprecated in PyMongo 3 and removed in
# PyMongo 4; it is kept here so the whole script keeps using one insert API.
# Switch every call to ``insert_one`` once the script targets PyMongo >= 3.
for table_id, table_name in (('1', 'dog'), ('2', 'cat'), ('3', 'mouse')):
    table.insert({'table_id': table_id, 'table': table_name})
# Directory that receives one ``<WARC-Trec-ID>.html`` file per exported record.
TEMPLATE_DIR = '/Users/Fan/Downloads/NTCIRMDB2/assessment/templates/'


def _export_warc(archive_path, topic_id, topic_name):
    """Write the records of one WARC archive to HTML files and index them.

    Every record except the first is written to
    ``TEMPLATE_DIR/<WARC-Trec-ID>.html`` and a matching document is inserted
    into the module-level ``topic`` collection.

    Args:
        archive_path: Path of the ``.warc.gz`` archive to read.
        topic_id: Value stored under ``'topic_id'`` in each inserted document.
        topic_name: Value stored under ``'topic'`` in each inserted document.
    """
    archive = warc.open(archive_path)
    try:
        for index, record in enumerate(archive):
            # The payload is read for every record, including the skipped one,
            # exactly as the original loops did.
            payload = record.payload.read()
            if index == 0:
                # The first record of each archive is not exported
                # (presumably the archive-level info record -- TODO confirm).
                continue

            # str() keeps the original behaviour of producing 'None' when the
            # header is missing.
            trec_id = str(record.header.get("WARC-Trec-ID"))
            file_name = trec_id + '.html'

            # Binary mode: the payload is written back unchanged, with no
            # newline translation.  The ``with`` block closes the file, so no
            # explicit close() is needed.
            with open(TEMPLATE_DIR + file_name, 'wb') as page:
                page.write(payload)

            topic.insert({
                'topic_id': topic_id,
                'topic': topic_name,
                # Position of the record in the archive, counting the skipped
                # first record as 0.
                'title_id': str(index),
                'title': trec_id,
                'url': file_name,
            })
    finally:
        # Release the archive handle even if a write or insert fails.
        archive.close()


_export_warc('/Users/Fan/Downloads/0000tw-00.warc.gz', '1', 'dog')
_export_warc('/Users/Fan/Downloads/0000wb-00.warc.gz', '2', 'cat')
_export_warc('/Users/Fan/Downloads/0000wt-00.warc.gz', '3', 'mouse')