Skip to content

Commit

Permalink
add sub proxy pool mechanics (#213)
Browse files Browse the repository at this point in the history
  • Loading branch information
inVains authored Mar 25, 2024
1 parent 0033586 commit 78b3244
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 30 deletions.
5 changes: 4 additions & 1 deletion proxypool/processors/getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from proxypool.storages.redis import RedisClient
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls

from proxypool.testers import __all__ as testers_cls

class Getter(object):
"""
Expand All @@ -16,6 +16,8 @@ def __init__(self):
self.redis = RedisClient()
self.crawlers_cls = crawlers_cls
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
self.testers_cls = testers_cls
self.testers = [tester_cls() for tester_cls in self.testers_cls]

def is_full(self):
"""
Expand All @@ -36,6 +38,7 @@ def run(self):
logger.info(f'crawler {crawler} to get proxy')
for proxy in crawler.crawl():
self.redis.add(proxy)
[self.redis.add(proxy, redis_key=tester.key) for tester in self.testers]


if __name__ == '__main__':
Expand Down
21 changes: 17 additions & 4 deletions proxypool/processors/server.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from flask import Flask, g, request
from proxypool.exceptions import PoolEmptyException
from proxypool.storages.redis import RedisClient
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED
import functools

__all__ = ['app']
Expand Down Expand Up @@ -53,10 +54,19 @@ def index():
@auth_required
def get_proxy():
    """
    get a random proxy, optionally from the sub-pool named by the `key` query parameter
    if that sub-pool is empty and PROXY_RAND_KEY_DEGRADED is truthy, a proxy from the
    universal pool is returned instead; otherwise the PoolEmptyException propagates
    :return: a random proxy, rendered by its string() method
    """
    sub_pool_key = request.args.get('key')
    conn = get_conn()
    # no key requested: serve straight from the universal pool
    if not sub_pool_key:
        return conn.random().string()
    try:
        return conn.random(sub_pool_key).string()
    except PoolEmptyException:
        if not PROXY_RAND_KEY_DEGRADED:
            raise
    # sub-pool was empty and degradation is enabled: fall back to the universal pool
    return conn.random().string()


Expand All @@ -67,8 +77,10 @@ def get_proxy_all():
get all proxies, or only those of the sub-pool named by the `key` query parameter
:return: all proxies
"""
key = request.args.get('key')

conn = get_conn()
proxies = conn.all()
proxies = conn.all(key) if key else conn.all()
proxies_string = ''
if proxies:
for proxy in proxies:
Expand All @@ -85,7 +97,8 @@ def get_count():
:return: count, int
"""
conn = get_conn()
return str(conn.count())
key = request.args.get('key')
return str(conn.count(key)) if key else conn.count()


if __name__ == '__main__':
Expand Down
28 changes: 28 additions & 0 deletions proxypool/processors/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
TEST_DONT_SET_MAX_SCORE
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
from asyncio import TimeoutError
from proxypool.testers import __all__ as testers_cls

EXCEPTIONS = (
ClientProxyConnectionError,
Expand All @@ -30,6 +31,8 @@ def __init__(self):
"""
self.redis = RedisClient()
self.loop = asyncio.get_event_loop()
self.testers_cls = testers_cls
self.testers = [tester_cls() for tester_cls in self.testers_cls]

async def test(self, proxy: Proxy):
"""
Expand Down Expand Up @@ -63,8 +66,33 @@ async def test(self, proxy: Proxy):
else:
self.redis.decrease(proxy)
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
# if independent tester class found, create new set of storage and do the extra test
for tester in self.testers:
key = tester.key
if self.redis.exists(proxy, key):
test_url = tester.test_url
headers = tester.headers()
cookies = tester.cookies()
async with session.get(test_url, proxy=f'http://{proxy.string()}',
timeout=TEST_TIMEOUT,
headers=headers,
cookies=cookies,
allow_redirects=False) as response:
resp_text = await response.text()
is_valid = await tester.parse(resp_text, test_url, proxy.string())
if is_valid:
if tester.test_dont_set_max_score:
logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score')
else:
self.redis.max(proxy, key, tester.proxy_score_max)
logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score')
else:
self.redis.decrease(proxy, tester.key, tester.proxy_score_min)
logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score')

except EXCEPTIONS:
self.redis.decrease(proxy)
[self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers]
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')

@logger.catch
Expand Down
2 changes: 2 additions & 0 deletions proxypool/setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
# whether to fall back to a universal random proxy when the sub-pool identified by a specific key is empty
# (read from its own environment variable; it previously read TEST_ANONYMOUS by mistake)
PROXY_RAND_KEY_DEGRADED = env.bool('PROXY_RAND_KEY_DEGRADED', True)

# definition of proxy number
PROXY_NUMBER_MAX = 50000
Expand Down
50 changes: 25 additions & 25 deletions proxypool/storages/redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db
self.db = redis.StrictRedis(
host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)

def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int:
"""
add proxy and set it to init score
:param proxy: proxy, ip:port, like 8.8.8.8:88
Expand All @@ -44,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
logger.info(f'invalid proxy {proxy}, throw it')
return
if not self.exists(proxy):
if not self.exists(proxy, redis_key):
if IS_REDIS_VERSION_2:
return self.db.zadd(REDIS_KEY, score, proxy.string())
return self.db.zadd(REDIS_KEY, {proxy.string(): score})
return self.db.zadd(redis_key, score, proxy.string())
return self.db.zadd(redis_key, {proxy.string(): score})

def random(self) -> Proxy:
def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy:
"""
get random proxy
firstly try to get proxy with max score
Expand All @@ -59,74 +59,74 @@ def random(self) -> Proxy:
"""
# try to get proxy with max score
proxies = self.db.zrangebyscore(
REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
redis_key, proxy_score_max, proxy_score_max)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else get proxy by rank
proxies = self.db.zrevrange(
REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
redis_key, proxy_score_min, proxy_score_max)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else raise error
raise PoolEmptyException

def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int:
    """
    decrease score of proxy by 1, if not greater than proxy_score_min, delete it
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool or sub-pool) to operate on
    :param proxy_score_min: removal threshold for the score
    :return: new score, as read back right after the decrement
    """
    member = proxy.string()
    # the two branches differ only in zincrby argument order, selected by IS_REDIS_VERSION_2
    if IS_REDIS_VERSION_2:
        self.db.zincrby(redis_key, member, -1)
    else:
        self.db.zincrby(redis_key, -1, member)
    score = self.db.zscore(redis_key, member)
    logger.info(f'{member} score decrease 1, current {score}')
    if score <= proxy_score_min:
        logger.info(f'{member} current score {score}, remove')
        self.db.zrem(redis_key, member)
    # the docstring and annotation always promised the new score; actually return it
    return score

def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool:
    """
    check whether the proxy is stored under the given key
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool or sub-pool) to look in
    :return: if exists, bool
    """
    # zscore yields None for a member that is not in the sorted set
    return self.db.zscore(redis_key, proxy.string()) is not None

def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int:
    """
    set proxy to max score
    :param proxy: proxy
    :param redis_key: key of the sorted set (pool or sub-pool) to operate on
    :param proxy_score_max: score to assign
    :return: result of the underlying zadd call
    """
    logger.info(f'{proxy.string()} is valid, set to {proxy_score_max}')
    # zadd is called with a mapping unless IS_REDIS_VERSION_2 selects the positional form
    if not IS_REDIS_VERSION_2:
        return self.db.zadd(redis_key, {proxy.string(): proxy_score_max})
    return self.db.zadd(redis_key, proxy_score_max, proxy.string())

def count(self, redis_key=REDIS_KEY) -> int:
    """
    get count of proxies stored under the given key
    :param redis_key: key of the sorted set (pool or sub-pool) to count
    :return: count, int
    """
    total = self.db.zcard(redis_key)
    return total

def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]:
    """
    get all proxies under the given key whose score lies in [proxy_score_min, proxy_score_max]
    :param redis_key: key of the sorted set (pool or sub-pool) to read
    :param proxy_score_min: lower score bound
    :param proxy_score_max: upper score bound
    :return: list of proxies
    """
    raw_proxies = self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max)
    return convert_proxy_or_proxies(raw_proxies)

def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]:
    """
    get batch of proxies via a zscan step
    :param cursor: scan cursor
    :param count: scan count
    :param redis_key: key of the sorted set (pool or sub-pool) to scan
    :return: tuple of (next cursor, list of proxies)
    """
    next_cursor, scanned = self.db.zscan(redis_key, cursor, count=count)
    # each scanned entry is indexable; its first element is the proxy string
    members = [entry[0] for entry in scanned]
    return next_cursor, convert_proxy_or_proxies(members)


Expand Down
16 changes: 16 additions & 0 deletions proxypool/testers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import importlib
import inspect
import pkgutil

from .base import BaseTester


# Collect every concrete subclass of BaseTester defined in this package's modules.
# NOTE: __all__ deliberately holds the classes themselves rather than their names;
# other modules import it and instantiate each entry.
classes = []
_package_prefix = f'{__name__}.'
for _finder, module_name, _is_pkg in pkgutil.walk_packages(__path__, _package_prefix):
    # Import under the fully qualified name so each module is executed only once
    # (the old find_module().load_module() path re-ran modules under a bare
    # top-level name and is no longer available on Python 3.12+).
    module = importlib.import_module(module_name)
    for member_name, value in inspect.getmembers(module):
        # Re-export module members from the package, but never dunder attributes:
        # copying __name__, __file__, __path__ ... would clobber the package's own metadata.
        if not (member_name.startswith('__') and member_name.endswith('__')):
            globals()[member_name] = value
        if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \
                and not getattr(value, 'ignore', False) and value not in classes:
            # a class imported into several modules is registered once only
            classes.append(value)
__all__ = __ALL__ = classes

19 changes: 19 additions & 0 deletions proxypool/testers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN


class BaseTester(object):
    """
    Base class for an extra, independent proxy test with its own sub-pool.

    Subclasses override the class attributes and hooks below; the defaults
    for the score settings come from proxypool.setting.
    """
    # URL fetched through the proxy for the extra test
    test_url = ""
    # redis key of the sub-pool that belongs to this tester
    key = ""
    test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE
    proxy_score_init = PROXY_SCORE_INIT
    proxy_score_max = PROXY_SCORE_MAX
    proxy_score_min = PROXY_SCORE_MIN

    def headers(self):
        """
        request headers for the test request
        :return: None by default
        """
        return None

    def cookies(self):
        """
        request cookies for the test request
        :return: None by default
        """
        return None

    async def parse(self, html, url, proxy, expr='{"code":0'):
        """
        decide whether the response marks the proxy as valid
        :param html: response text
        :param url: tested url (unused by the default implementation)
        :param proxy: proxy string (unused by the default implementation)
        :param expr: substring that must occur in html
        :return: True if expr occurs in html, else False
        """
        return expr in html

0 comments on commit 78b3244

Please sign in to comment.