
catch retry error

Germey committed 3 years ago · commit 08385f6463

+ 19 - 9
proxypool/crawlers/base.py

@@ -1,4 +1,4 @@
-from retrying import retry
+from retrying import RetryError, retry
 import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
@@ -23,15 +23,25 @@ class BaseCrawler(object):
         except requests.ConnectionError:
             return
 
-    @logger.catch
+    def process(self, html, url):
+        """
+        used for parse html
+        """
+        for proxy in self.parse(html):
+            logger.info(f'fetched proxy {proxy.string()} from {url}')
+            yield proxy
+
     def crawl(self):
         """
         crawl main method
         """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url)
-            time.sleep(.5)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+        try:
+            for url in self.urls:
+                logger.info(f'fetching {url}')
+                html = self.fetch(url)
+                time.sleep(.5)
+                yield from self.process(html, url)
+        except RetryError:
+            logger.error(
+                f'crawler {self} crawled proxy unsuccessfully, '
+                'please check if target url is valid or network issue')
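
The RetryError handling above matters because fetch is wrapped by retrying (in this project something along the lines of @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)): once every attempt comes back empty, retrying raises RetryError instead of returning None. A minimal sketch of that behaviour, assuming a decorator of that shape:

from retrying import RetryError, retry

@retry(stop_max_attempt_number=3, wait_fixed=100,
       retry_on_result=lambda result: result is None)
def fetch(url):
    # stand-in for BaseCrawler.fetch against an unreachable source:
    # every attempt returns None, which retry_on_result treats as a failure
    return None

try:
    fetch('https://example.com/proxies')
except RetryError as error:
    # raised after the third attempt; crawl() above logs it and returns
    print(f'all attempts failed: {error}')

Catching the error inside crawl keeps the method a plain generator: a dead source is logged and the crawler stops cleanly instead of propagating the exception out of the iteration.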

+ 0 - 16
proxypool/crawlers/public/data5u.py

@@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler):
     data5u crawler, http://www.data5u.com
     """
     urls = [BASE_URL]
-    
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
     def parse(self, html):
         """
         parse html file to get proxies
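
With the per-crawler crawl override removed here (and in xicidaili.py below), a crawler only declares its urls and implements parse; fetching, retry handling and logging all live in BaseCrawler.crawl. A hypothetical subclass showing just that contract (the URL and selectors are made up for illustration):

from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    """
    illustrative crawler: only urls and parse are defined,
    everything else is inherited from BaseCrawler
    """
    urls = ['https://example.com/free-proxy-list']  # hypothetical source

    def parse(self, html):
        doc = pq(html)
        for row in doc('#proxy-table tr').items():  # hypothetical selectors
            host, port = row('td.ip').text(), row('td.port').text()
            if host and port:
                yield Proxy(host=host, port=int(port))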

+ 0 - 0
proxypool/crawlers/public/fatezero_proxylist.py → proxypool/crawlers/public/fatezero.py


+ 1 - 0
proxypool/crawlers/public/ihuan.py

@@ -13,6 +13,7 @@ class IhuanCrawler(BaseCrawler):
     path = time.strftime("%Y/%m/%d/%H", time.localtime())
     urls = [BASE_URL.format(path=path)]
     ignore = False
+
     def parse(self, html):
         """
         parse html file to get proxies

+ 8 - 4
proxypool/crawlers/public/jiangxianli.py

@@ -1,23 +1,27 @@
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
-import re
 import json
+
+
 BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
 
 MAX_PAGE = 10
+
+
 class JiangxianliCrawler(BaseCrawler):
     """
     jiangxianli crawler,https://ip.jiangxianli.com/
     """
+
     urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
-    
+
     def parse(self, html):
         """
         parse html file to get proxies
         :return:
         """
-        
-        result =json.loads(html)
+
+        result = json.loads(html)
         if result['code'] != 0:
             return
         MAX_PAGE = int(result['data']['last_page'])

+ 10 - 5
proxypool/crawlers/public/xiaoshudaili.py

@@ -1,7 +1,5 @@
 import re
-
 from pyquery import PyQuery as pq
-
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 
@@ -16,16 +14,23 @@ class XiaoShuCrawler(BaseCrawler):
     """
 
     def __init__(self):
-        html = self.fetch(url=BASE_URL)
+        """
+        init urls
+        """
+        try:
+            html = self.fetch(url=BASE_URL)
+        except:
+            self.urls = []
+            return
         doc = pq(html)
         title = doc(".title:eq(0) a").items()
-
         latest_page = 0
         for t in title:
             res = re.search(r"/(\d+)\.html", t.attr("href"))
             latest_page = int(res.group(1)) if res else 0
         if latest_page:
-            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
+            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(
+                latest_page - MAX_PAGE, latest_page)]
         else:
             self.urls = []
 

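The guarded constructor above exists for the same reason crawl now catches RetryError: XiaoShuCrawler calls fetch while it is being built, and Getter instantiates every crawler class up front, so an unreachable index page would otherwise abort getter construction. A rough sketch of the fallback (hypothetical helper names, assuming fetch raises RetryError once its attempts run out):

from retrying import RetryError

def init_urls(fetch, base_url, build_urls):
    """Mirror the guarded __init__: fall back to no urls instead of crashing."""
    try:
        html = fetch(base_url)  # may raise RetryError if the site is down
    except RetryError:          # the commit itself uses a bare except here
        return []               # crawl() over an empty url list yields nothing
    return build_urls(html)
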
+ 0 - 17
proxypool/crawlers/public/xicidaili.py

@@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler):
     """
     urls = [BASE_URL]
     ignore = True
-    
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
     def parse(self, html):
         """
         parse html file to get proxies
@@ -49,4 +33,3 @@ if __name__ == '__main__':
     crawler = XicidailiCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

+ 1 - 1
proxypool/crawlers/public/zhandaye.py

@@ -8,6 +8,7 @@ import re
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
 MAX_PAGE = 5 * 2
 
+
 class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -56,4 +57,3 @@ if __name__ == '__main__':
     crawler = ZhandayeCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

+ 3 - 3
proxypool/processors/getter.py

@@ -8,7 +8,7 @@ class Getter(object):
     """
     getter of proxypool
     """
-    
+
     def __init__(self):
         """
         init db and crawlers
@@ -16,14 +16,14 @@ class Getter(object):
         self.redis = RedisClient()
         self.crawlers_cls = crawlers_cls
         self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
-    
+
     def is_full(self):
         """
         if proxypool if full
         return: bool
         """
         return self.redis.count() >= PROXY_NUMBER_MAX
-    
+
     @logger.catch
     def run(self):
         """