woshipm_dl.py
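"""Download articles from a woshipm.com category as an EPUB.

Walks the category's paginated listing (newest first), binary-searches for
the page containing the requested end date, collects article links inside
the date window, writes a crawler config JSON, and hands it to the external
`crawl-epub` tool, which is assumed to be available on PATH.

Usage:
    python woshipm_dl.py dl <category> <start YYYYMMDD> <end YYYYMMDD>
    python woshipm_dl.py dlyr <category> <year>
"""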
import sys
import requests
from pyquery import PyQuery as pq
import json
import subprocess as subp
from os import path
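
# Template config consumed by the crawl-epub tool; the CSS selectors target
# woshipm.com's article markup (title, body, and elements to strip).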
crawler_config = {
    "name": "",
    "url": "http://www.woshipm.com/",
    "link": "",
    "title": "h2.article--title",
    "content": ".article--content",
    "remove": ".article--actions, .article-bottomAd, .pm-adTitle, .support-author",
    "optiMode": "thres",
    "list": []
}

def get_info(html):
    # Parse a category listing page: total page count (the last pagination
    # number) and the category name (first segment of the <title> tag).
    root = pq(html)
    el_pgnums = root('.nav-links .page-numbers:not(.dots):not(.prev):not(.next)')
    if len(el_pgnums) == 0:
        total = 1
    else:
        total = int(
            el_pgnums.eq(len(el_pgnums) - 1)
            .text().strip()
            .replace(',', '')
        )
    cate = root('title').text().split(' | ')[0]
    return {'total': total, 'cate': cate}

def get_dt_range(html):
    # Articles are listed newest-first, so the first <time> on the page is
    # the newest date ('ed') and the last is the oldest ('st').
    root = pq(html)
    el_times = root('time')
    return {
        'ed': el_times.eq(0).text().replace('-', ''),
        'st': el_times.eq(len(el_times) - 1).text().replace('-', ''),
    }

def get_toc(html):
    # Collect (link, date) pairs for every article on a listing page.
    root = pq(html)
    el_links = root('h2.post-title>a')
    el_times = root('time')
    return [
        {
            'link': el_links.eq(i).attr('href'),
            'dt': el_times.eq(i).text().replace('-', ''),
        } for i in range(len(el_links))
    ]

def get_first_pg(cate, dt, total):
    # Binary-search the paginated listing (newest-first) for the page whose
    # date range contains `dt`. Returns -1 if no page covers that date.
    st = 1
    ed = total
    while st <= ed:
        mid = (st + ed) // 2
        url = f'http://www.woshipm.com/category/{cate}/page/{mid}'
        html = requests.get(url).text
        dt_range = get_dt_range(html)
        print(mid, dt_range)
        if dt_range['st'] <= dt <= dt_range['ed']:
            return mid
        elif dt < dt_range['st']:
            # Target date is older than this page; move toward later pages.
            st = mid + 1
        else:
            ed = mid - 1
    return -1

def download(cate, stdt, eddt):
    config = crawler_config.copy()
    config['list'] = []  # copy() is shallow: reset the shared template list
    url = f'http://www.woshipm.com/category/{cate}'
    html = requests.get(url).text
    info = get_info(html)
    print(info)
    cateName = info['cate']
    config['name'] = f'人人都是产品经理社区:{cateName}分类 {stdt}-{eddt}'
    if path.exists(config['name'] + '.epub'):
        print('already exists, skipping')
        return
    # Pages run newest-first, so the window's end date sits on the earliest
    # page that we need to scan.
    st = get_first_pg(cate, eddt, info['total'])
    if st == -1:
        st = 1  # no page covers eddt; fall back to scanning from page 1
    stop = False
    for i in range(st, info['total'] + 1):
        if stop: break
        print(f'page: {i}')
        url = f'http://www.woshipm.com/category/{cate}/page/{i}'
        html = requests.get(url).text
        pgtoc = get_toc(html)
        if len(pgtoc) == 0: break
        for art in pgtoc:
            if art['dt'] > eddt: continue   # newer than the window: skip
            if art['dt'] < stdt:            # older than the window: done
                stop = True
                break
            print(art)
            config['list'].append(art['link'])
    fname = f'config_woshipm_{cate}_{stdt}_{eddt}.json'
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(json.dumps(config))
    subp.Popen(f'crawl-epub {fname}', shell=True).communicate()

def download_year(cate, year):
    # Download the category month by month across one year.
    is_leap = lambda year: \
        year % 4 == 0 and year % 100 != 0 or year % 400 == 0
    days_in_month = {
        1: 31,
        2: 29 if is_leap(int(year)) else 28,
        3: 31,
        4: 30,
        5: 31,
        6: 30,
        7: 31,
        8: 31,
        9: 30,
        10: 31,
        11: 30,
        12: 31,
    }
    for mon, nd in days_in_month.items():
        download(cate, f'{year}{mon:02d}01', f'{year}{mon:02d}{nd:02d}')

def main():
    cmd = sys.argv[1]
    if cmd == 'dl':
        download(sys.argv[2], sys.argv[3], sys.argv[4])
    elif cmd == 'dlyr':
        download_year(sys.argv[2], sys.argv[3])

if __name__ == '__main__': main()
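
# Example invocations (the category slug here is hypothetical; use a real
# woshipm.com category path segment):
#   python woshipm_dl.py dl pd 20200101 20200131   # one date window
#   python woshipm_dl.py dlyr pd 2020              # a whole year, month by month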