Fix bug: changes to the Redis sorted set reorder its members, so some proxies get tested repeatedly (#78)

* Use a mirror for pip

* Set a timeout when fetching proxies

Some sites that publish proxy IPs are blocked or otherwise unreliable, so a request to them can take several minutes to come back.
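
A minimal sketch of the idea, mirroring the base.py change below; GET_TIMEOUT stands in for the new setting, and the exception handling is illustrative:

    import requests

    GET_TIMEOUT = 10  # seconds; configurable via the GET_TIMEOUT env var in setting.py

    def fetch(url, **kwargs):
        # Default the timeout so a blocked site fails fast instead of
        # hanging for minutes; callers can still override it.
        kwargs.setdefault('timeout', GET_TIMEOUT)
        try:
            response = requests.get(url, **kwargs)
            if response.status_code == 200:
                return response.text
        except (requests.ConnectionError, requests.Timeout):
            return None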

* Proxy IPs beyond page 5 are two or three days stale, so their quality is very poor

* Fix bug: changes to the Redis sorted set reorder its members, so some proxies get tested repeatedly

Fixes #73.
A Redis sorted set keeps its members ordered by score. If a score changes while the set is being traversed by index, members shift rank: some are visited again and others are never reached. The tester now traverses with a cursor instead.
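
ZREVRANGE pages by rank, and ranks move whenever a score is decremented mid-test, so a fixed start/end window revisits some members and misses others. ZSCAN's cursor tolerates this: any member present for the whole scan is returned at least once. A minimal sketch of the loop, with the key name and batch size as illustrative placeholders:

    import redis

    db = redis.StrictRedis(decode_responses=True)
    REDIS_KEY = 'proxies'  # illustrative; the real key lives in setting.py
    TEST_BATCH = 256       # illustrative scan hint

    cursor = 0
    while True:
        # zscan returns (new_cursor, [(member, score), ...]); count is only
        # a hint, so batches may be larger or smaller than requested.
        cursor, items = db.zscan(REDIS_KEY, cursor, count=TEST_BATCH)
        for member, score in items:
            ...  # schedule the proxy for testing
        if cursor == 0:
            break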

* Guarantee that Redis never holds an entry with a score of 0

The old logic either decremented a proxy's score or deleted the proxy outright. That meant a proxy whose score reached 0 after a decrement still remained in Redis. The logic is now: decrement first, then compare the resulting score against PROXY_SCORE_MIN.
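
A sketch of the decrement-then-check order (the redis.py hunk below is the real change); the wiring here is simplified:

    PROXY_SCORE_MIN = 0  # illustrative; the real value comes from setting.py

    def decrease(db, redis_key, member):
        # Decrement first, then re-read the score and compare it against the
        # minimum, so nothing can linger at or below PROXY_SCORE_MIN.
        db.zincrby(redis_key, -1, member)  # redis-py 3.x argument order
        score = db.zscore(redis_key, member)
        if score is not None and score <= PROXY_SCORE_MIN:
            db.zrem(redis_key, member)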

* Fix bug: setting LOG_DIR had no effect (#62)
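
The fix (see the setting.py hunk below) builds the default log paths under LOG_DIR instead of using bare relative file names, roughly:

    from os.path import join
    from loguru import logger

    LOG_DIR = '/path/to/logs'  # illustrative; resolved in setting.py

    # Joining LOG_DIR into the defaults is what makes the setting take effect;
    # the LOG_RUNTIME_FILE / LOG_ERROR_FILE env vars can still override it.
    logger.add(join(LOG_DIR, 'runtime.log'), level='DEBUG', rotation='1 week')
    logger.add(join(LOG_DIR, 'error.log'), level='ERROR', rotation='1 week')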

* Remove an unused import

* Change the pip source

* Fix bug: the logic for reading and checking scores

* Remove the unusable proxy sources iphai and xicidaili; fix a zhandaye bug

* Fix bug: the zhandaye catalog was fetched only once

* Fix bug: zhandaye's crawl had no return value, which raised an error

* Restore iphai and xicidaili, and add an ignore attribute to both classes
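
The class loader in crawlers/__init__.py (first hunk below) skips any BaseCrawler subclass with a truthy ignore attribute, so an unusable source can stay in the tree without being scheduled. Marking one is a single class attribute, e.g.:

    class XicidailiCrawler(BaseCrawler):
        urls = [BASE_URL]
        ignore = True  # site currently unusable; the loader skips this class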

* Add a MAX_PAGE variable

* Set the pip source

* Restore the Dockerfile
winturn, 5 years ago
parent commit 737998f2f0

+ 2 - 1
proxypool/crawlers/__init__.py

@@ -1,8 +1,8 @@
 import pkgutil
 from .base import BaseCrawler
-from .public.zhandaye import ZhandayeDetailCrawler
 import inspect
 
+
 # load classes subclass of BaseCrawler
 classes = []
 for loader, name, is_pkg in pkgutil.walk_packages(__path__):
@@ -13,3 +13,4 @@ for loader, name, is_pkg in pkgutil.walk_packages(__path__):
                 and not getattr(value, 'ignore', False):
             classes.append(value)
 __all__ = __ALL__ = classes
+

+ 4 - 1
proxypool/crawlers/base.py

@@ -1,6 +1,7 @@
 from retrying import retry
 import requests
 from loguru import logger
+from proxypool.setting import GET_TIMEOUT
 
 
 class BaseCrawler(object):
@@ -9,8 +10,11 @@ class BaseCrawler(object):
     @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
     def fetch(self, url, **kwargs):
         try:
+            kwargs.setdefault('timeout', GET_TIMEOUT)
+            kwargs.setdefault('verify', False)
             response = requests.get(url, **kwargs)
             if response.status_code == 200:
+                response.encoding = 'utf-8'
                 return response.text
         except requests.ConnectionError:
             return
@@ -23,7 +27,6 @@ class BaseCrawler(object):
         for url in self.urls:
             logger.info(f'fetching {url}')
             html = self.fetch(url)
-            print('html', html)
             for proxy in self.parse(html):
                 logger.info(f'fetched proxy {proxy.string()} from {url}')
                 yield proxy

+ 2 - 1
proxypool/crawlers/public/iphai.py

@@ -10,7 +10,7 @@ class IPHaiCrawler(BaseCrawler):
     iphai crawler, http://www.iphai.com/
     """
     urls = [BASE_URL]
-    
+    ignore = True
     
     def parse(self, html):
         """
@@ -32,3 +32,4 @@ if __name__ == '__main__':
     crawler = IPHaiCrawler()
     for proxy in crawler.crawl():
         print(proxy)
+

+ 2 - 1
proxypool/crawlers/public/kuaidaili.py

@@ -5,13 +5,14 @@ from pyquery import PyQuery as pq
 
 
 BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
+MAX_PAGE = 5
 
 
 class KuaidailiCrawler(BaseCrawler):
     """
     kuaidaili crawler, https://www.kuaidaili.com/
     """
-    urls = [BASE_URL.format(page=page) for page in range(1, 200)]
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
     
     def parse(self, html):
         """

+ 2 - 0
proxypool/crawlers/public/xicidaili.py

@@ -11,6 +11,7 @@ class XicidailiCrawler(BaseCrawler):
     xididaili crawler, https://www.xicidaili.com/
     """
     urls = [BASE_URL]
+    ignore = True
     
     headers = {
         'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
@@ -48,3 +49,4 @@ if __name__ == '__main__':
     crawler = XicidailiCrawler()
     for proxy in crawler.crawl():
         print(proxy)
+

+ 19 - 21
proxypool/crawlers/public/zhandaye.py

@@ -2,6 +2,7 @@ from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 from loguru import logger
+import re
 
 
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
@@ -11,50 +12,47 @@ class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
     """
-    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]
-
+    urls_catalog = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]
     headers = {
         'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
     }
+    urls = []
 
     def crawl(self):
-        for url in self.urls:
+        self.crawl_catalog()
+        yield from super().crawl()
+
+    def crawl_catalog(self):
+        for url in self.urls_catalog:
             logger.info(f'fetching {url}')
             html = self.fetch(url, headers=self.headers)
-            self.parse(html)
+            self.parse_catalog(html)
 
-    def parse(self, html):
+    def parse_catalog(self, html):
         """
         parse html file to get proxies
         :return:
         """
         doc = pq(html)
         for item in doc('#J_posts_list .thread_item div div p a').items():
-            post = 'https://www.zdaye.com' + item.attr('href')
-            logger.info(f'get detail url: {post}')
-            ZhandayeDetailCrawler(post).crawl()
-
-
-class ZhandayeDetailCrawler(BaseCrawler):
-    urls = []
-    ignore = True
-
-    def __init__(self, url):
-        self.urls.append(url)
-        super().__init__()
+            url = 'https://www.zdaye.com' + item.attr('href')
+            logger.info(f'get detail url: {url}')
+            self.urls.append(url)
 
     def parse(self, html):
         doc = pq(html)
         trs = doc('.cont br').items()
         for tr in trs:
             line = tr[0].tail
-            host = line.split(':')[0]
-            port = line.split(':')[1][:4]
-            yield Proxy(host=host, port=port)
-
+            match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
+            if match:
+                host = match.group(1)
+                port = match.group(2)
+                yield Proxy(host=host, port=port)
 
 
 if __name__ == '__main__':
     crawler = ZhandayeCrawler()
     for proxy in crawler.crawl():
         print(proxy)
+

+ 2 - 1
proxypool/processors/getter.py

@@ -33,10 +33,11 @@ class Getter(object):
         if self.is_full():
             return
         for crawler in self.crawlers:
+            logger.info(f'crawler {crawler} to get proxy')
             for proxy in crawler.crawl():
                 self.redis.add(proxy)
 
 
 if __name__ == '__main__':
     getter = Getter()
-    getter.run()
+    getter.run()

+ 9 - 8
proxypool/processors/tester.py

@@ -74,14 +74,15 @@ class Tester(object):
         logger.info('stating tester...')
         count = self.redis.count()
         logger.debug(f'{count} proxies to test')
-        for i in range(0, count, TEST_BATCH):
-            # start end end offset
-            start, end = i, min(i + TEST_BATCH, count)
-            logger.debug(f'testing proxies from {start} to {end} indices')
-            proxies = self.redis.batch(start, end)
-            tasks = [self.test(proxy) for proxy in proxies]
-            # run tasks using event loop
-            self.loop.run_until_complete(asyncio.wait(tasks))
+        cursor = 0
+        while True:
+            logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
+            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
+            if proxies:
+                tasks = [self.test(proxy) for proxy in proxies]
+                self.loop.run_until_complete(asyncio.wait(tasks))
+            if not cursor:
+                break
 
 
 if __name__ == '__main__':

+ 4 - 2
proxypool/setting.py

@@ -53,6 +53,7 @@ PROXY_NUMBER_MIN = 0
 CYCLE_TESTER = env.int('CYCLE_TESTER', 20)
 # definition of getter cycle, it will get proxy every CYCLE_GETTER second
 CYCLE_GETTER = env.int('CYCLE_GETTER', 100)
+GET_TIMEOUT = env.int('GET_TIMEOUT', 10)
 
 # definition of tester
 TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
@@ -75,5 +76,6 @@ ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
 ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
 ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
 
-logger.add(env.str('LOG_RUNTIME_FILE', 'runtime.log'), level='DEBUG', rotation='1 week', retention='20 days')
-logger.add(env.str('LOG_ERROR_FILE', 'error.log'), level='ERROR', rotation='1 week')
+logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days')
+logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')
+

+ 13 - 13
proxypool/storages/redis.py

@@ -67,17 +67,15 @@ class RedisClient(object):
         :param proxy: proxy
         :return: new score
         """
-        score = self.db.zscore(REDIS_KEY, proxy.string())
-        # current score is larger than PROXY_SCORE_MIN
-        if score and score > PROXY_SCORE_MIN:
-            logger.info(f'{proxy.string()} current score {score}, decrease 1')
-            if IS_REDIS_VERSION_2:
-                return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
-            return self.db.zincrby(REDIS_KEY, -1, proxy.string())
-        # otherwise delete proxy
+        if IS_REDIS_VERSION_2:
+            self.db.zincrby(REDIS_KEY, proxy.string(), -1)
         else:
+            self.db.zincrby(REDIS_KEY, -1, proxy.string())
+        score = self.db.zscore(REDIS_KEY, proxy.string())
+        logger.info(f'{proxy.string()} score decrease 1, current {score}')
+        if score <= PROXY_SCORE_MIN:
             logger.info(f'{proxy.string()} current score {score}, remove')
-            return self.db.zrem(REDIS_KEY, proxy.string())
+            self.db.zrem(REDIS_KEY, proxy.string())
 
     def exists(self, proxy: Proxy) -> bool:
         """
@@ -112,17 +110,19 @@ class RedisClient(object):
         """
         return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
 
-    def batch(self, start, end) -> List[Proxy]:
+    def batch(self, cursor, count) -> List[Proxy]:
         """
         get batch of proxies
-        :param start: start index
-        :param end: end index
+        :param cursor: scan cursor
+        :param count: scan count
         :return: list of proxies
         """
-        return convert_proxy_or_proxies(self.db.zrevrange(REDIS_KEY, start, end - 1))
+        cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
+        return cursor, convert_proxy_or_proxies([i[0] for i in proxies])
 
 
 if __name__ == '__main__':
     conn = RedisClient()
     result = conn.random()
     print(result)
+