
catch retry error

Germey committed 3 years ago · commit 08385f6463

+ 19 - 9
proxypool/crawlers/base.py

@@ -1,4 +1,4 @@
-from retrying import retry
+from retrying import RetryError, retry
 import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
@@ -23,15 +23,25 @@ class BaseCrawler(object):
         except requests.ConnectionError:
             return
 
-    @logger.catch
+    def process(self, html, url):
+        """
+        used for parse html
+        """
+        for proxy in self.parse(html):
+            logger.info(f'fetched proxy {proxy.string()} from {url}')
+            yield proxy
+
     def crawl(self):
         """
         crawl main method
         """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url)
-            time.sleep(.5)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+        try:
+            for url in self.urls:
+                logger.info(f'fetching {url}')
+                html = self.fetch(url)
+                time.sleep(.5)
+                yield from self.process(html, url)
+        except RetryError:
+            logger.error(
+                f'crawler {self} crawled proxy unsuccessfully, '
+                'please check if target url is valid or network issue')
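
The RetryError handling above matters because fetch is wrapped by retrying (in this project something along the lines of @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)): once every attempt comes back empty, retrying raises RetryError instead of returning None. A minimal sketch of that behaviour, assuming a decorator of that shape:

from retrying import RetryError, retry

@retry(stop_max_attempt_number=3, wait_fixed=100,
       retry_on_result=lambda result: result is None)
def fetch(url):
    # stand-in for BaseCrawler.fetch against an unreachable source:
    # every attempt returns None, which retry_on_result treats as a failure
    return None

try:
    fetch('https://example.com/proxies')
except RetryError as error:
    # raised after the third attempt; crawl() above logs it and returns
    print(f'all attempts failed: {error}')

Catching the error inside crawl keeps the method a plain generator: a dead source is logged and the crawler stops cleanly instead of propagating the exception out of the iteration.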

+ 0 - 16
proxypool/crawlers/public/data5u.py

@@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler):
     data5u crawler, http://www.data5u.com
     """
     urls = [BASE_URL]
-    
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
     def parse(self, html):
         """
         parse html file to get proxies
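
With the per-crawler crawl override removed here (and in xicidaili.py below), a crawler only declares its urls and implements parse; fetching, retry handling and logging all live in BaseCrawler.crawl. A hypothetical subclass showing just that contract (the URL and selectors are made up for illustration):

from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    """
    illustrative crawler: only urls and parse are defined,
    everything else is inherited from BaseCrawler
    """
    urls = ['https://example.com/free-proxy-list']  # hypothetical source

    def parse(self, html):
        doc = pq(html)
        for row in doc('#proxy-table tr').items():  # hypothetical selectors
            host, port = row('td.ip').text(), row('td.port').text()
            if host and port:
                yield Proxy(host=host, port=int(port))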

+ 0 - 0
proxypool/crawlers/public/fatezero_proxylist.py → proxypool/crawlers/public/fatezero.py


+ 1 - 0
proxypool/crawlers/public/ihuan.py

@@ -13,6 +13,7 @@ class IhuanCrawler(BaseCrawler):
     path = time.strftime("%Y/%m/%d/%H", time.localtime())
     urls = [BASE_URL.format(path=path)]
     ignore = False
+
     def parse(self, html):
         """
         parse html file to get proxies

+ 8 - 4
proxypool/crawlers/public/jiangxianli.py

@@ -1,23 +1,27 @@
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
-import re
 import json
+
+
 BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
 
 MAX_PAGE = 10
+
+
 class JiangxianliCrawler(BaseCrawler):
     """
     jiangxianli crawler,https://ip.jiangxianli.com/
     """
+
     urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
-    
+
     def parse(self, html):
         """
         parse html file to get proxies
         :return:
         """
-        
-        result =json.loads(html)
+
+        result = json.loads(html)
         if result['code'] != 0:
             return
         MAX_PAGE = int(result['data']['last_page'])

+ 10 - 5
proxypool/crawlers/public/xiaoshudaili.py

@@ -1,7 +1,5 @@
 import re
-
 from pyquery import PyQuery as pq
-
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 
@@ -16,16 +14,23 @@ class XiaoShuCrawler(BaseCrawler):
     """
 
     def __init__(self):
-        html = self.fetch(url=BASE_URL)
+        """
+        init urls
+        """
+        try:
+            html = self.fetch(url=BASE_URL)
+        except:
+            self.urls = []
+            return
         doc = pq(html)
         title = doc(".title:eq(0) a").items()
-
         latest_page = 0
         for t in title:
             res = re.search(r"/(\d+)\.html", t.attr("href"))
             latest_page = int(res.group(1)) if res else 0
         if latest_page:
-            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
+            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(
+                latest_page - MAX_PAGE, latest_page)]
         else:
             self.urls = []
 

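The guarded constructor above exists for the same reason crawl now catches RetryError: XiaoShuCrawler calls fetch while it is being built, and Getter instantiates every crawler class up front, so an unreachable index page would otherwise abort getter construction. A rough sketch of the fallback (hypothetical helper names, assuming fetch raises RetryError once its attempts run out):

from retrying import RetryError

def init_urls(fetch, base_url, build_urls):
    """Mirror the guarded __init__: fall back to no urls instead of crashing."""
    try:
        html = fetch(base_url)  # may raise RetryError if the site is down
    except RetryError:          # the commit itself uses a bare except here
        return []               # crawl() over an empty url list yields nothing
    return build_urls(html)
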
+ 0 - 17
proxypool/crawlers/public/xicidaili.py

@@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler):
     """
     urls = [BASE_URL]
     ignore = True
-    
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
     def parse(self, html):
         """
         parse html file to get proxies
@@ -49,4 +33,3 @@ if __name__ == '__main__':
     crawler = XicidailiCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

+ 1 - 1
proxypool/crawlers/public/zhandaye.py

@@ -8,6 +8,7 @@ import re
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
 MAX_PAGE = 5 * 2
 
+
 class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -56,4 +57,3 @@ if __name__ == '__main__':
     crawler = ZhandayeCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

+ 3 - 3
proxypool/processors/getter.py

@@ -8,7 +8,7 @@ class Getter(object):
     """
     getter of proxypool
     """
-    
+
     def __init__(self):
         """
         init db and crawlers
@@ -16,14 +16,14 @@ class Getter(object):
         self.redis = RedisClient()
         self.crawlers_cls = crawlers_cls
         self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
-    
+
     def is_full(self):
         """
         if proxypool if full
         return: bool
         """
         return self.redis.count() >= PROXY_NUMBER_MAX
-    
+
     @logger.catch
     def run(self):
         """