Skip to content

Commit 78b3244

Browse files
authored
add sub proxy pool mechanics (#213)
1 parent 0033586 commit 78b3244

File tree

7 files changed

+111
-30
lines changed

7 files changed

+111
-30
lines changed

proxypool/processors/getter.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from proxypool.storages.redis import RedisClient
33
from proxypool.setting import PROXY_NUMBER_MAX
44
from proxypool.crawlers import __all__ as crawlers_cls
5-
5+
from proxypool.testers import __all__ as testers_cls
66

77
class Getter(object):
88
"""
@@ -16,6 +16,8 @@ def __init__(self):
1616
self.redis = RedisClient()
1717
self.crawlers_cls = crawlers_cls
1818
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
19+
self.testers_cls = testers_cls
20+
self.testers = [tester_cls() for tester_cls in self.testers_cls]
1921

2022
def is_full(self):
2123
"""
@@ -36,6 +38,7 @@ def run(self):
3638
logger.info(f'crawler {crawler} to get proxy')
3739
for proxy in crawler.crawl():
3840
self.redis.add(proxy)
41+
[self.redis.add(proxy, redis_key=tester.key) for tester in self.testers]
3942

4043

4144
if __name__ == '__main__':

proxypool/processors/server.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from flask import Flask, g, request
2+
from proxypool.exceptions import PoolEmptyException
23
from proxypool.storages.redis import RedisClient
3-
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV
4+
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED
45
import functools
56

67
__all__ = ['app']
@@ -53,10 +54,19 @@ def index():
5354
@auth_required
def get_proxy():
    """
    get a random proxy, optionally from the sub-pool selected by the ``key`` query argument
    if the sub-pool is empty and PROXY_RAND_KEY_DEGRADED is set to True, a random proxy
    from the universal pool is returned instead; otherwise PoolEmptyException propagates
    :return: get a random proxy
    """
    key = request.args.get('key')
    conn = get_conn()
    if not key:
        # no sub-pool requested, serve from the universal pool
        return conn.random().string()
    try:
        return conn.random(key).string()
    except PoolEmptyException:
        if not PROXY_RAND_KEY_DEGRADED:
            raise
    # sub-pool is empty and degrading is enabled, fall back to the universal pool
    return conn.random().string()
6171

6272

@@ -67,8 +77,10 @@ def get_proxy_all():
6777
get a random proxy
6878
:return: get a random proxy
6979
"""
80+
key = request.args.get('key')
81+
7082
conn = get_conn()
71-
proxies = conn.all()
83+
proxies = conn.all(key) if key else conn.all()
7284
proxies_string = ''
7385
if proxies:
7486
for proxy in proxies:
@@ -85,7 +97,8 @@ def get_count():
8597
:return: count, int
8698
"""
8799
conn = get_conn()
88-
return str(conn.count())
100+
key = request.args.get('key')
101+
return str(conn.count(key)) if key else conn.count()
89102

90103

91104
if __name__ == '__main__':

proxypool/processors/tester.py

+28
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
TEST_DONT_SET_MAX_SCORE
88
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
99
from asyncio import TimeoutError
10+
from proxypool.testers import __all__ as testers_cls
1011

1112
EXCEPTIONS = (
1213
ClientProxyConnectionError,
@@ -30,6 +31,8 @@ def __init__(self):
3031
"""
3132
self.redis = RedisClient()
3233
self.loop = asyncio.get_event_loop()
34+
self.testers_cls = testers_cls
35+
self.testers = [tester_cls() for tester_cls in self.testers_cls]
3336

3437
async def test(self, proxy: Proxy):
3538
"""
@@ -63,8 +66,33 @@ async def test(self, proxy: Proxy):
6366
else:
6467
self.redis.decrease(proxy)
6568
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
69+
# if independent tester class found, create new set of storage and do the extra test
70+
for tester in self.testers:
71+
key = tester.key
72+
if self.redis.exists(proxy, key):
73+
test_url = tester.test_url
74+
headers = tester.headers()
75+
cookies = tester.cookies()
76+
async with session.get(test_url, proxy=f'http://{proxy.string()}',
77+
timeout=TEST_TIMEOUT,
78+
headers=headers,
79+
cookies=cookies,
80+
allow_redirects=False) as response:
81+
resp_text = await response.text()
82+
is_valid = await tester.parse(resp_text, test_url, proxy.string())
83+
if is_valid:
84+
if tester.test_dont_set_max_score:
85+
logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score')
86+
else:
87+
self.redis.max(proxy, key, tester.proxy_score_max)
88+
logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score')
89+
else:
90+
self.redis.decrease(proxy, tester.key, tester.proxy_score_min)
91+
logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score')
92+
6693
except EXCEPTIONS:
6794
self.redis.decrease(proxy)
95+
[self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers]
6896
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
6997

7098
@logger.catch

proxypool/setting.py

+2
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656
PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
5757
PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
5858
PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
59+
# whether to fall back to a random proxy from the universal pool when the
# sub-pool identified by a specific key has no proxy available
# (read from its own environment variable, like the other settings here)
PROXY_RAND_KEY_DEGRADED = env.bool('PROXY_RAND_KEY_DEGRADED', True)
5961

6062
# definition of proxy number
6163
PROXY_NUMBER_MAX = 50000

proxypool/storages/redis.py

+25-25
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db
3434
self.db = redis.StrictRedis(
3535
host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)
3636

37-
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
37+
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int:
3838
"""
3939
add proxy and set it to init score
4040
:param proxy: proxy, ip:port, like 8.8.8.8:88
@@ -44,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
4444
if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
4545
logger.info(f'invalid proxy {proxy}, throw it')
4646
return
47-
if not self.exists(proxy):
47+
if not self.exists(proxy, redis_key):
4848
if IS_REDIS_VERSION_2:
49-
return self.db.zadd(REDIS_KEY, score, proxy.string())
50-
return self.db.zadd(REDIS_KEY, {proxy.string(): score})
49+
return self.db.zadd(redis_key, score, proxy.string())
50+
return self.db.zadd(redis_key, {proxy.string(): score})
5151

52-
def random(self) -> Proxy:
52+
def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy:
5353
"""
5454
get random proxy
5555
firstly try to get proxy with max score
@@ -59,74 +59,74 @@ def random(self) -> Proxy:
5959
"""
6060
# try to get proxy with max score
6161
proxies = self.db.zrangebyscore(
62-
REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
62+
redis_key, proxy_score_max, proxy_score_max)
6363
if len(proxies):
6464
return convert_proxy_or_proxies(choice(proxies))
6565
# else get proxy by rank
6666
proxies = self.db.zrevrange(
67-
REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
67+
redis_key, proxy_score_min, proxy_score_max)
6868
if len(proxies):
6969
return convert_proxy_or_proxies(choice(proxies))
7070
# else raise error
7171
raise PoolEmptyException
7272

73-
def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int:
    """
    decrease score of proxy by 1, if not greater than proxy_score_min, delete it
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool) holding the proxy
    :param proxy_score_min: the proxy is removed once its score is at or below this value
    :return: new score
    """
    member = proxy.string()
    # ZINCRBY replies with the updated score, so use it directly instead of a
    # second ZSCORE lookup, which could return None if the member was removed
    # in between and then break the comparison below
    if IS_REDIS_VERSION_2:
        score = self.db.zincrby(redis_key, member, -1)
    else:
        score = self.db.zincrby(redis_key, -1, member)
    logger.info(f'{member} score decrease 1, current {score}')
    if score <= proxy_score_min:
        logger.info(f'{member} current score {score}, remove')
        self.db.zrem(redis_key, member)
    return score
8888

89-
def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool:
    """
    check whether proxy is stored under the given key
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool) to look in
    :return: if exists, bool
    """
    # a member that is absent from the sorted set has no score (None)
    return self.db.zscore(redis_key, proxy.string()) is not None
9696

97-
def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int:
    """
    set the score of proxy to proxy_score_max
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool) to update
    :param proxy_score_max: score assigned to the proxy
    :return: result of the underlying zadd call
    """
    member = proxy.string()
    logger.info(f'{member} is valid, set to {proxy_score_max}')
    if not IS_REDIS_VERSION_2:
        return self.db.zadd(redis_key, {member: proxy_score_max})
    # the redis version 2 branch passes score and member positionally
    return self.db.zadd(redis_key, proxy_score_max, member)
107107

108-
def count(self, redis_key=REDIS_KEY) -> int:
    """
    count the proxies stored under the given key
    :param redis_key: key of the sorted set (pool) to count
    :return: count, int
    """
    total = self.db.zcard(redis_key)
    return total
114114

115-
def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]:
    """
    get the proxies of a pool selected by score range
    :param redis_key: key of the sorted set (pool) to read
    :param proxy_score_min: lower score bound passed to zrangebyscore
    :param proxy_score_max: upper score bound passed to zrangebyscore
    :return: list of proxies
    """
    members = self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max)
    return convert_proxy_or_proxies(members)
121121

122-
def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]:
    """
    get one batch of proxies through zscan
    :param cursor: scan cursor
    :param count: scan count
    :param redis_key: key of the sorted set (pool) to scan
    :return: the cursor returned by zscan together with the list of proxies
    """
    cursor, scanned = self.db.zscan(redis_key, cursor, count=count)
    # only the first element of each scanned item is converted
    members = [item[0] for item in scanned]
    return cursor, convert_proxy_or_proxies(members)
131131

132132

proxypool/testers/__init__.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import importlib
import inspect
import pkgutil

from .base import BaseTester


# load every usable subclass of BaseTester defined in the modules of this package
classes = []
for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
    # importlib.import_module replaces loader.find_module(...).load_module(...),
    # which is deprecated and unavailable on recent Python versions
    module = importlib.import_module(f'{__name__}.{module_name}')
    for member_name, value in inspect.getmembers(module):
        # re-export module members at package level, but leave the package's own
        # dunder globals (__name__, __file__, __spec__, ...) untouched
        if not member_name.startswith('__'):
            globals()[member_name] = value
        if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \
                and not getattr(value, 'ignore', False):
            classes.append(value)
__all__ = __ALL__ = classes
16+

proxypool/testers/base.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN
2+
3+
4+
class BaseTester(object):
    """
    base class of the testers that maintain a sub-pool of proxies
    subclasses override the class attributes and hooks below
    """
    # url requested through the proxy when it is tested
    test_url = ""
    # redis key of the sub-pool handled by this tester
    key = ""
    # when true, a valid proxy keeps its current score instead of being set to the max
    test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE
    # presumably the initial score of a proxy in this sub-pool -- TODO confirm against callers
    proxy_score_init = PROXY_SCORE_INIT
    # score given to a valid proxy
    proxy_score_max = PROXY_SCORE_MAX
    # score at or below which a proxy is dropped from the sub-pool
    proxy_score_min = PROXY_SCORE_MIN

    def headers(self):
        """
        headers sent with the test request
        :return: None by default
        """
        return None

    def cookies(self):
        """
        cookies sent with the test request
        :return: None by default
        """
        return None

    async def parse(self, html, url, proxy, expr='{"code":0'):
        """
        decide from the response text whether the proxy passed the test
        :param html: response text of the test request
        :param url: requested url, unused by the default implementation
        :param proxy: proxy string, unused by the default implementation
        :param expr: marker expected to occur in html
        :return: True if expr occurs in html, else False
        """
        return expr in html

0 commit comments

Comments
 (0)