-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
Copy pathusage2.py
107 lines (94 loc) · 3.61 KB
/
usage2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: UTF-8 -*-
'''
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
import uuid
from geolite2 import geolite2
# Shared list of proxy "host:port" strings, refreshed by GetIpThread.run().
ips = []
# Helper: decide whether an IP address geolocates to mainland China.
def getChinaIP(ip='127.0.0.1'):
    """Return True when *ip* geolocates to mainland China (ISO code "CN").

    Looks *ip* up in the local GeoLite2 database.  Private or unknown
    addresses (e.g. the default 127.0.0.1) have no record, in which case
    this returns False instead of raising, so one bad proxy entry cannot
    crash the calling thread.
    """
    reader = geolite2.reader()
    try:
        ip_info = reader.get(ip)
    finally:
        # Always release the database handle, even if the lookup raises.
        geolite2.close()
    print(ip_info)
    if not ip_info:
        # No record for this address — treat as "not China".
        return False
    country = ip_info.get('country') or {}
    return country.get('iso_code') == 'CN'
class CrawlThread(threading.Thread):
    """Worker that sends one request to the module-level targetUrl through a proxy.

    proxyip is a "host:port" string.  The request goes over plain HTTP
    through that proxy with randomized browser headers and a fresh
    PHPSESSID cookie, so each hit presents itself as a new session.

    NOTE(review): this block previously contained unresolved git merge
    conflict markers (<<<<<<< / ======= / >>>>>>>), which made the whole
    file a SyntaxError.  Resolved here keeping the HEAD side: http-only
    proxy mapping, 12-second timeout, spoof headers left disabled.
    """

    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        # "host:port" of the proxy to route the request through.
        self.proxyip = proxyip

    def run(self):
        # Only the host part is needed for the geolocation check.
        pure_ip_address = self.proxyip.split(':')[0]
        # Reject proxies that do not geolocate to China.
        if not getChinaIP(pure_ip_address):
            raise ValueError('不是有效IP')
        # Start timing the request.
        start = time.time()
        # Silence the InsecureRequestWarning caused by verify=False below.
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        # headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        # headers['Host'] = 'bb.cf08tp.cn'
        # headers['x-forward-for'] = pure_ip_address
        # Fresh session id per request so the target sees a new visitor.
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        html = requests.get(headers=headers, url=targetUrl, proxies={
            "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
        # Stop timing and report what came back.
        end = time.time()
        print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) +
              "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")
# Thread that periodically pulls proxy IPs from the local pool API and
# drives one CrawlThread per IP.
class GetIpThread(threading.Thread):
    """Forever: fetch a batch of proxies from apiUrl and crawl through each.

    fetchSecond is the target consumption rate (IPs per second) used to
    pace the refresh interval between batches.
    """

    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        # IPs consumed per second; the loop sleeps len(ips)/fetchSecond
        # between refreshes.
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            # Fetch the IP list; the API returns one "host:port" per line.
            # A transient API failure must not kill this loop thread,
            # so retry after a short pause instead of propagating.
            try:
                res = requests.get(apiUrl).content.decode()
            except Exception as e:
                print(e)
                time.sleep(self.fetchSecond)
                continue
            # Split the response into individual proxy entries.
            ips = res.split('\n')
            for proxyip in ips:
                if proxyip.strip():
                    # run() (not start()) — execute synchronously so
                    # requests are paced by the sleep below.
                    try:
                        CrawlThread(proxyip).run()
                        time.sleep(1.5)
                    except Exception as e:
                        print(e)
            # Pause proportionally to the batch size before refreshing.
            time.sleep(len(ips) / self.fetchSecond)
if __name__ == '__main__':
    # Page to hit through each proxy (a vote endpoint on the target site).
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # Local proxy-pool API; each request returns one proxy.
    apiUrl = "http://127.0.0.1:5555/random"
    # Pacing parameter: proxies consumed per second.
    fetchSecond = 5
    # Launch the endless fetch-and-crawl loop in its own thread.
    GetIpThread(fetchSecond).start()