Add two proxy crawlers (#203)

Shayne Wang committed 2 years ago
commit 2344ee1db0
2 changed files with 87 additions and 0 deletions
  1. proxypool/crawlers/public/docip.py (+37, -0)
  2. proxypool/crawlers/public/uqidata.py (+50, -0)

+ 37 - 0
proxypool/crawlers/public/docip.py

@@ -0,0 +1,37 @@
+import time
+import json
+from loguru import logger
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+
+BASE_URL = 'https://www.docip.net/data/free.json?t={date}'
+
+
+class DocipCrawler(BaseCrawler):
+    """
+    Docip crawler, https://www.docip.net/data/free.json
+    """
+    urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]
+
+    def parse(self, html):
+        """
+        parse the JSON response to get proxies
+        :return:
+        """
+        try:
+            result = json.loads(html)
+            proxy_list = result['data']
+            for proxy_item in proxy_list:
+                host = proxy_item['ip']
+                port = proxy_item['port']
+                yield Proxy(host=host, port=port)
+        except json.JSONDecodeError:
+            logger.error('failed to decode JSON from {}', BASE_URL)
+            return
+
+
+if __name__ == '__main__':
+    crawler = DocipCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)
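
For reference, here is a minimal sketch of the payload shape that parse() above expects. The data / ip / port keys come from the code; the sample values are invented:

    sample = '{"data": [{"ip": "1.2.3.4", "port": "8080"}]}'
    for proxy in DocipCrawler().parse(sample):
        print(proxy)  # should print something like 1.2.3.4:8080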

+ 50 - 0
proxypool/crawlers/public/uqidata.py

@@ -0,0 +1,50 @@
+from pyquery import PyQuery as pq
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+
+BASE_URL = 'https://ip.uqidata.com/free/index.html'
+
+
+class UqidataCrawler(BaseCrawler):
+    """
+    Uqidata crawler, https://ip.uqidata.com/free/index.html
+    """
+    urls = [BASE_URL]
+    ignore = True
+
+    @staticmethod
+    def encode(input_str):
+        # the port is hidden in a CSS class name: each letter maps to a digit
+        # ("ABCDEFGHIZ".find gives A=0 ... I=8, Z=9) and the joined digits are
+        # the real port shifted left by 3 bits, so shift right to recover it
+        digits = ["ABCDEFGHIZ".find(c) for c in input_str]
+        result = "".join(str(d) for d in digits)
+        return int(result) >> 0x03
+
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        doc = pq(html)
+        trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
+        for tr in trs:
+            ip_html = tr('td.ip').find("*").items()
+            host = ''
+            for i in ip_html:
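+                # skip decoy elements hidden via an inline display:none style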
+                if i.attr('style') is not None and 'none' in i.attr('style'):
+                    continue
+                if i.text() == '':
+                    continue
+                host += i.text()
+
+            port_code = tr('td.port').attr('class').split(' ')[1]
+            port = UqidataCrawler.encode(port_code)
+            yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = UqidataCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)
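
To make the port de-obfuscation concrete, here is a worked example for encode() under the letter-to-digit mapping above (A=0 ... I=8, Z=9): the class suffix 'GEGEA' maps to the digits 64640, and 64640 >> 3 recovers port 8080.

    assert UqidataCrawler.encode('GEGEA') == 8080  # int('64640') >> 3
    assert UqidataCrawler.encode('GEA') == 80      # int('640') >> 3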