@@ -0,0 +1,81 @@
+import json
+import time
+
+from loguru import logger
+from retrying import RetryError
+
+from proxypool.crawlers.base import BaseCrawler
+from proxypool.schemas.proxy import Proxy
+
+BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
+MAX_PAGE = 18
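+# each page returns up to 500 proxies (limit=500), most recently checked first,
+# so 18 pages cover roughly the 9000 newest entries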
+
+
+class GeonodeCrawler(BaseCrawler):
+    """
+    Geonode crawler, https://proxylist.geonode.com/
+    """
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+
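+    # the API returns JSON of the form (shape inferred from the fields
+    # accessed in parse below; any extra fields are ignored):
+    # {"data": [{"ip": "1.2.3.4", "port": "8080"}, ...]}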
+    def parse(self, html):
+        """
+        parse the JSON API response to extract proxies
+        :return: generator of Proxy objects
+        """
+        try:
+            result = json.loads(html)
+            proxy_list = result['data']
+            for proxy_item in proxy_list:
+                host = proxy_item['ip']
+                port = proxy_item['port']
+                yield Proxy(host=host, port=port)
+        except json.JSONDecodeError:
+            logger.error('failed to decode geonode response as JSON')
+            return
+
+    def crawl(self):
+        """
+        override the crawl main method
+        to send browser-like headers with each request
+        """
+        headers = {
+            'authority': 'proxylist.geonode.com',
+            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
+            'accept': 'application/json, text/plain, */*',
+            'sec-ch-ua-mobile': '?0',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
+            'sec-ch-ua-platform': '"macOS"',
+            'origin': 'https://geonode.com',
+            'sec-fetch-site': 'same-site',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-dest': 'empty',
+            'referer': 'https://geonode.com/',
+            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
+            # the 'if-none-match' ETag header is deliberately dropped here;
+            # replaying a stale ETag can yield 304 responses with empty bodies
+        }
+        try:
+            for url in self.urls:
+                logger.info(f'fetching {url}')
+                html = self.fetch(url, headers=headers)
+                if not html:
+                    continue
+                # pause briefly between pages to avoid hammering the API
+                time.sleep(0.5)
+                yield from self.process(html, url)
+        except RetryError:
+            logger.error(
+                f'crawler {self} crawled proxies unsuccessfully, '
+                'please check whether the target url is valid or there is a network issue')
+
+
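+# quick smoke test: run this module directly to crawl all pages and print each proxy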
+if __name__ == '__main__':
+    crawler = GeonodeCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)