
Add several proxy crawlers and optimize parts of the code (#108)

* Create ip89.py

www.89ip.cn free proxies

* Update ip89.py

update class name

* Create fatezero_proxylist.py

Add proxies from http://proxylist.fatezero.org/

* Create ihuan.py

i幻 (ip.ihuan.me) proxies

* update example usage2

* update requirements.txt

* Optimize public crawlers

* add proxy jiangxianli

* Add a single-proxy test method to tester

* Reset setting, Dockerfile, and docker-compose to defaults

Co-authored-by: jy <[email protected]>
Co-authored-by: 崔庆才丨静觅 <[email protected]>
j.yao.SUSE, 4 years ago
Commit 4878bf5ae3

+ 2 - 1
Dockerfile

@@ -1,6 +1,7 @@
 FROM python:3.6
 WORKDIR /app
 COPY . .
-RUN pip install -r requirements.txt
+# RUN pip install -r requirements.txt  -i https://pypi.douban.com/simple
+RUN pip install -r requirements.txt
 VOLUME ["/app/proxypool/crawlers/private"]
 CMD ["supervisord", "-c", "supervisord.conf"]

+ 1 - 1
docker-compose.yml

@@ -6,7 +6,7 @@ services:
     command: redis-server
     ports:
       - "6379:6379"
-  #    restart: always
+    # restart: always
   proxypool:
     build: .
     image: 'germey/proxypool'

+ 95 - 0
examples/usage2.py

@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+
+'''Fetch proxies from the proxypool API and use each one to request
+a target URL through that proxy.'''
+import requests
+import time
+import threading
+import urllib3
+from fake_headers import Headers
+import uuid
+from geolite2 import geolite2
+ips = []
+
+# Thread class that crawls data through a proxy
+
+def getChinaIP(ip='127.0.0.1'):
+    reader = geolite2.reader()
+    ip_info = reader.get(ip)
+    geolite2.close()
+    print(ip_info)
+    return ip_info is not None and ip_info.get('country', {}).get('iso_code') == 'CN'  # reader.get returns None for unknown IPs
+
+
+
+class CrawlThread(threading.Thread):
+    def __init__(self, proxyip):
+        super(CrawlThread, self).__init__()
+        self.proxyip = proxyip
+
+    def run(self):
+        # start timing
+        pure_ip_address = self.proxyip.split(':')[0]
+        # verify that the IP is located in mainland China
+        if not getChinaIP(pure_ip_address):
+            # pass
+            raise ValueError('IP is not located in mainland China')
+
+        start = time.time()
+        # suppress the warning raised when certificate verification is disabled
+        urllib3.disable_warnings()
+        headers = Headers(headers=True).generate()
+        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
+        headers['Pragma'] = 'no-cache'
+        headers['Host'] = 'bb.cf08tp.cn'
+        headers['X-Forwarded-For'] = pure_ip_address
+        headers['Cookie'] = 'PHPSESSID={}'.format(
+            ''.join(str(uuid.uuid1()).split('-')))
+        print(headers)
+        html = requests.get(headers=headers, url=targetUrl, proxies={
+                            "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
+        # stop timing
+        end = time.time()
+        # print the result
+        print(threading.current_thread().getName() + " used proxy IP, took " + str(end - start) +
+              " seconds: " + self.proxyip + " fetched the following HTML:\n" + html + "\n*************")
+
+# Thread class that fetches proxy IPs from the pool API
+
+
+class GetIpThread(threading.Thread):
+    def __init__(self, fetchSecond):
+        super(GetIpThread, self).__init__()
+        self.fetchSecond = fetchSecond
+
+    def run(self):
+        global ips
+        while True:
+            # fetch the IP list
+            res = requests.get(apiUrl).content.decode()
+            # split the response into individual IPs by newline
+            ips = res.split('\n')
+            # use each IP in turn
+            for proxyip in ips:
+                if proxyip.strip():
+                    # start a crawl for this proxy (threaded version commented out)
+                    # CrawlThread(proxyip).start()
+                    try:
+                        CrawlThread(proxyip).run()
+                        time.sleep(1.5)
+                    except Exception as e:
+                        print(e)
+            # sleep before fetching the next batch
+            time.sleep(len(ips) / self.fetchSecond)
+
+
+if __name__ == '__main__':
+    # API endpoint for fetching proxy IPs
+    # apiUrl = "http://127.0.0.1:5555/all"
+    apiUrl = "http://127.0.0.1:5555/random"
+    # target URL to crawl
+    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
+    fetchSecond = 5
+    # start fetching IPs automatically
+    GetIpThread(fetchSecond).start()

+ 6 - 3
proxypool/crawlers/base.py

@@ -2,17 +2,19 @@ from retrying import retry
 import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
-
-
+from fake_headers import Headers
+import time
 class BaseCrawler(object):
     urls = []
     
     @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
     def fetch(self, url, **kwargs):
         try:
+            headers = Headers(headers=True).generate()
             kwargs.setdefault('timeout', GET_TIMEOUT)
             kwargs.setdefault('verify', False)
-            response = requests.get(url, **kwargs)
+            kwargs.setdefault('headers', headers)
+            response = requests.get(url, **kwargs)
             if response.status_code == 200:
                 response.encoding = 'utf-8'
                 return response.text
@@ -27,6 +29,7 @@ class BaseCrawler(object):
         for url in self.urls:
             logger.info(f'fetching {url}')
             html = self.fetch(url)
+            time.sleep(.5)
             for proxy in self.parse(html):
                 logger.info(f'fetched proxy {proxy.string()} from {url}')
                 yield proxy
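
For reference, a minimal sketch of what the updated fetch path does per request after this change: generate random browser-like headers with fake_headers and pass them to requests together with a timeout and disabled certificate verification. The URL and timeout value below are placeholders standing in for a crawler's own URL and GET_TIMEOUT.

import requests
from fake_headers import Headers

# random browser-like headers, regenerated for each request
headers = Headers(headers=True).generate()

# roughly what BaseCrawler.fetch now does; URL and timeout are placeholders
response = requests.get('http://www.66ip.cn/1.html',
                        headers=headers, timeout=10, verify=False)
if response.status_code == 200:
    response.encoding = 'utf-8'
    print(response.text[:200])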

+ 1 - 1
proxypool/crawlers/public/daili66.py

@@ -4,7 +4,7 @@ from proxypool.crawlers.base import BaseCrawler
 
 
 BASE_URL = 'http://www.66ip.cn/{page}.html'
-MAX_PAGE = 5
+MAX_PAGE = 50
 
 
 class Daili66Crawler(BaseCrawler):

+ 2 - 3
proxypool/crawlers/public/fatezero_proxylist.py

@@ -19,13 +19,12 @@ class FatezeroCrawler(BaseCrawler):
         
         hosts_ports = html.split('\n')
         for addr in hosts_ports:
-            ip_address = json.loads(addr)
-            if(True):
+            if(addr):
+                ip_address = json.loads(addr)
                 host = ip_address['host']
                 port = ip_address['port']
                 yield Proxy(host=host, port=port)
 
-
 if __name__ == '__main__':
     crawler = FatezeroCrawler()
     for proxy in crawler.crawl():

+ 44 - 0
proxypool/crawlers/public/goubanjia.py

@@ -0,0 +1,44 @@
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+import re
+from pyquery import PyQuery as pq
+import time
+BASE_URL = 'http://www.goubanjia.com/'
+
+
+class GoubanjiaCrawler(BaseCrawler):
+    """
+    ip  Goubanjia crawler, http://www.goubanjia.com/
+    """
+    urls = [BASE_URL]
+    
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        doc = pq(html)('.ip').items()
+        # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))])
+        for td in doc:
+            trs = td.children()
+            ip_str = ''
+            for tr in trs:
+                attrib = tr.attrib
+                if 'style' in attrib and 'none' in  tr.attrib['style']:
+                    continue
+                ip_str+= '' if not tr.text else tr.text
+            addr_split = ip_str.split(':')
+            if(len(addr_split) == 2):
+                host = addr_split[0]
+                port = addr_split[1]
+                yield Proxy(host=host, port=port)
+            else:
+                port = trs[-1].text
+                host = ip_str.replace(port,'')
+                yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = GoubanjiaCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)

+ 3 - 2
proxypool/crawlers/public/ihuan.py

@@ -10,8 +10,9 @@ class IhuanCrawler(BaseCrawler):
     """
     ip  ihuan crawler, https://ip.ihuan.me
     """
-    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]
-    
+    path = time.strftime("%Y/%m/%d/%H", time.localtime())
+    urls = [BASE_URL.format(path=path)]
+    ignore = False
     def parse(self, html):
         """
         parse html file to get proxies

+ 3 - 3
proxypool/crawlers/public/ip3366.py

@@ -3,15 +3,15 @@ from proxypool.schemas.proxy import Proxy
 import re
 
 
-MAX_PAGE = 5
-BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
+MAX_PAGE = 8
+BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}'
 
 
 class IP3366Crawler(BaseCrawler):
     """
     ip3366 crawler, http://www.ip3366.net/
     """
-    urls = [BASE_URL.format(page=i) for i in range(1, 8)]
+    urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)]
     
     def parse(self, html):
         """

+ 35 - 0
proxypool/crawlers/public/jiangxianli.py

@@ -0,0 +1,35 @@
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+import re
+import json
+BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
+
+MAX_PAGE = 10
+class JiangxianliCrawler(BaseCrawler):
+    """
+    jiangxianli crawler,https://ip.jiangxianli.com/
+    """
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+    
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        
+        result =json.loads(html)
+        if result['code'] != 0:
+            return
+        MAX_PAGE = int(result['data']['last_page'])
+        hosts_ports = result['data']['data']
+        for ip_address in hosts_ports:
+            if(ip_address):
+                host = ip_address['ip']
+                port = ip_address['port']
+                yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = JiangxianliCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)

+ 3 - 3
proxypool/crawlers/public/kuaidaili.py

@@ -4,15 +4,15 @@ import re
 from pyquery import PyQuery as pq
 
 
-BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
-MAX_PAGE = 5
+BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/'
+MAX_PAGE = 300
 
 
 class KuaidailiCrawler(BaseCrawler):
     """
     kuaidaili crawler, https://www.kuaidaili.com/
     """
-    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+    urls = [BASE_URL.format(type=type,page=page)  for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)]
     
     def parse(self, html):
         """

+ 1 - 1
proxypool/crawlers/public/zhandaye.py

@@ -6,7 +6,7 @@ import re
 
 
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
-MAX_PAGE = 5
+MAX_PAGE = 5 * 2
 
 class ZhandayeCrawler(BaseCrawler):
     """

+ 15 - 0
proxypool/processors/server.py

@@ -37,6 +37,21 @@ def get_proxy():
     return conn.random().string()
 
 
+@app.route('/all')
+def get_proxy_all():
+    """
+    get all proxies in the pool
+    :return: all proxies, one per line
+    """
+    conn = get_conn()
+    proxies = conn.all()
+    proxies_string = ''
+    for proxy in proxies:
+        proxies_string += str(proxy) + '\n'
+
+    return proxies_string
+
+
 @app.route('/count')
 def get_count():
     """

+ 7 - 0
proxypool/processors/tester.py

@@ -84,7 +84,14 @@ class Tester(object):
             if not cursor:
                 break
 
+def run_tester():
+    host = '96.113.165.182'
+    port = '3128'
+    tasks = [tester.test(Proxy(host=host, port=port))]
+    tester.loop.run_until_complete(asyncio.wait(tasks))
 
 if __name__ == '__main__':
     tester = Tester()
     tester.run()
+    # run_tester()
+
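
For manual checks, a minimal sketch of exercising a single proxy through Tester, mirroring run_tester above; it assumes the proxypool package is importable and a local Redis instance is reachable (Tester opens a storage connection on construction), and the host/port are arbitrary examples.

import asyncio

from proxypool.processors.tester import Tester
from proxypool.schemas.proxy import Proxy

tester = Tester()
# run one test coroutine to completion, as run_tester does
task = tester.test(Proxy(host='96.113.165.182', port='3128'))
tester.loop.run_until_complete(asyncio.wait([task]))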

+ 2 - 2
proxypool/storages/redis.py

@@ -51,11 +51,11 @@ class RedisClient(object):
         :return: proxy, like 8.8.8.8:8
         """
         # try to get proxy with max score
-        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
+        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else get proxy by rank
-        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
+        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else raise error

+ 11 - 9
requirements.txt

@@ -1,11 +1,13 @@
-environs==7.2.0
-Flask==1.0.3
-attrs==19.1.0
+environs==9.3.0
+Flask==1.1.2
+attrs==20.3.0
 retrying==1.3.3
 aiohttp==3.7.4
-requests==2.22.0
-loguru==0.3.2
-pyquery==1.4.0
-supervisor==4.1.0
-redis==2.10.6
-lxml==4.6.2
+requests==2.25.1
+loguru==0.5.3
+pyquery==1.4.3
+supervisor==4.2.1
+redis==3.5.3
+lxml==4.6.2
+fake_headers==1.0.2
+maxminddb_geolite2==2018.703