Germey · 5 years ago
parent · commit 39bcf6b48f

+ 0 - 29
examples/example.py

@@ -1,29 +0,0 @@
-import os
-import sys
-import requests
-from bs4 import BeautifulSoup
-
-dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0, dir)
-
-
-def get_proxy():
-    r = requests.get('http://127.0.0.1:5555/random')
-    proxy = BeautifulSoup(r.text, "lxml").get_text()
-    return proxy
-
-
-def crawl(url, proxy):
-    proxies = {'http': proxy}
-    r = requests.get(url, proxies=proxies)
-    return r.text
-
-
-def main():
-    proxy = get_proxy()
-    html = crawl('http://docs.jinkan.org/docs/flask/', proxy)
-    print(html)
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 15
examples/proxytest.py

@@ -1,15 +0,0 @@
-import requests
-from proxypool.setting import TEST_URL
-
-proxy = '96.9.90.90:8080'
-
-proxies = {
-    'http': 'http://' + proxy,
-    'https': 'https://' + proxy,
-}
-
-print(TEST_URL)
-response = requests.get(TEST_URL, proxies=proxies, verify=False)
-if response.status_code == 200:
-    print('Successfully')
-    print(response.text)

+ 8 - 9
proxypool/api.py

@@ -1,42 +1,41 @@
 from flask import Flask, g
-
 from .db import RedisClient
+from .setting import API_HOST, API_PORT

 __all__ = ['app']

 app = Flask(__name__)

-
 def get_conn():
+    """
+    get redis client object
+    :return:
+    """
     if not hasattr(g, 'redis'):
         g.redis = RedisClient()
     return g.redis

-
 @app.route('/')
 def index():
     return '<h2>Welcome to Proxy Pool System</h2>'

-
 @app.route('/random')
 def get_proxy():
     """
-    Get a proxy
+    get a random proxy
     :return: random proxy
     """
     conn = get_conn()
     return conn.random()

-
 @app.route('/count')
 def get_counts():
     """
-    Get the count of proxies
+    get the count of proxies
     :return: total count of proxies in the pool
     """
     conn = get_conn()
     return str(conn.count())

-
 if __name__ == '__main__':
-    app.run()
+    app.run(host=API_HOST, port=API_PORT, threaded=True)

+ 0 - 147
proxypool/crawler.py

@@ -1,147 +0,0 @@
-import json
-import re
-from .utils import get_page
-from pyquery import PyQuery as pq
-
-
-class ProxyMetaclass(type):
-    def __new__(cls, name, bases, attrs):
-        count = 0
-        attrs['__CrawlFunc__'] = []
-        for k, v in attrs.items():
-            if 'crawl_' in k:
-                attrs['__CrawlFunc__'].append(k)
-                count += 1
-        attrs['__CrawlFuncCount__'] = count
-        return type.__new__(cls, name, bases, attrs)
-
-
-class Crawler(object, metaclass=ProxyMetaclass):
-    def get_proxies(self, callback):
-        proxies = []
-        for proxy in eval("self.{}()".format(callback)):
-            print('Successfully got proxy', proxy)
-            proxies.append(proxy)
-        return proxies
-       
-    def crawl_daili66(self, page_count=4):
-        """
-        get proxies from daili66
-        :param page_count: page count
-        :return: proxy
-        """
-        start_url = 'http://www.66ip.cn/{}.html'
-        urls = [start_url.format(page) for page in range(1, page_count + 1)]
-        for url in urls:
-            print('Crawling', url)
-            html = get_page(url)
-            if html:
-                doc = pq(html)
-                trs = doc('.containerbox table tr:gt(0)').items()
-                for tr in trs:
-                    ip = tr.find('td:nth-child(1)').text()
-                    port = tr.find('td:nth-child(2)').text()
-                    yield ':'.join([ip, port])
-
-    def crawl_ip3366(self):
-        for page in range(1, 4):
-            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
-            html = get_page(start_url)
-            ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
-            # \s* matches whitespace, including line breaks
-            re_ip_address = ip_address.findall(html)
-            for address, port in re_ip_address:
-                result = address+':'+ port
-                yield result.replace(' ', '')
-    
-    def crawl_kuaidaili(self):
-        for i in range(1, 4):
-            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
-            html = get_page(start_url)
-            if html:
-                ip_address = re.compile('<td data-title="IP">(.*?)</td>') 
-                re_ip_address = ip_address.findall(html)
-                port = re.compile('<td data-title="PORT">(.*?)</td>')
-                re_port = port.findall(html)
-                for address,port in zip(re_ip_address, re_port):
-                    address_port = address+':'+port
-                    yield address_port.replace(' ','')
-
-    def crawl_xicidaili(self):
-        for i in range(1, 3):
-            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
-            headers = {
-                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-                'Cookie':'_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
-                'Host':'www.xicidaili.com',
-                'Referer':'http://www.xicidaili.com/nn/3',
-                'Upgrade-Insecure-Requests':'1',
-            }
-            html = get_page(start_url, options=headers)
-            if html:
-                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
-                trs = find_trs.findall(html)
-                for tr in trs:
-                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>') 
-                    re_ip_address = find_ip.findall(tr)
-                    find_port = re.compile('<td>(\d+)</td>')
-                    re_port = find_port.findall(tr)
-                    for address,port in zip(re_ip_address, re_port):
-                        address_port = address+':'+port
-                        yield address_port.replace(' ','')
-    
-    def crawl_ip3366(self):
-        for i in range(1, 4):
-            start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
-            html = get_page(start_url)
-            if html:
-                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
-                trs = find_tr.findall(html)
-                for s in range(1, len(trs)):
-                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
-                    re_ip_address = find_ip.findall(trs[s])
-                    find_port = re.compile('<td>(\d+)</td>')
-                    re_port = find_port.findall(trs[s])
-                    for address,port in zip(re_ip_address, re_port):
-                        address_port = address+':'+port
-                        yield address_port.replace(' ','')
-    
-    def crawl_iphai(self):
-        start_url = 'http://www.iphai.com/'
-        html = get_page(start_url)
-        if html:
-            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
-            trs = find_tr.findall(html)
-            for s in range(1, len(trs)):
-                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
-                re_ip_address = find_ip.findall(trs[s])
-                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
-                re_port = find_port.findall(trs[s])
-                for address,port in zip(re_ip_address, re_port):
-                    address_port = address+':'+port
-                    yield address_port.replace(' ','')
-
-    def crawl_data5u(self):
-        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
-        headers = {
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-            'Accept-Encoding': 'gzip, deflate',
-            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
-            'Cache-Control': 'max-age=0',
-            'Connection': 'keep-alive',
-            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
-            'Host': 'www.data5u.com',
-            'Referer': 'http://www.data5u.com/free/index.shtml',
-            'Upgrade-Insecure-Requests': '1',
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
-        }
-        html = get_page(start_url, options=headers)
-        if html:
-            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S)
-            re_ip_address = ip_address.findall(html)
-            for address, port in re_ip_address:
-                result = address + ':' + port
-                yield result.replace(' ', '')
-
-
-            

+ 10 - 0
proxypool/crawlers/__init__.py

@@ -0,0 +1,10 @@
+from .daili66 import Daili66Crawler
+from .ip3366 import IP3366Crawler
+from .iphai import IPHaiCrawler
+
+
+__all__ = [
+    Daili66Crawler,
+    IP3366Crawler,
+    IPHaiCrawler
+]

+ 27 - 0
proxypool/crawlers/base.py

@@ -0,0 +1,27 @@
+from retrying import retry
+import requests
+from loguru import logger
+
+
+class BaseCrawler(object):
+    urls = []
+    
+    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None)
+    def fetch(self, url, **kwargs):
+        try:
+            response = requests.get(url, **kwargs)
+            if response.status_code == 200:
+                return response.text
+        except requests.ConnectionError:
+            return
+        
+    @logger.catch
+    def crawl(self):
+        """
+        crawl main method
+        """
+        for url in self.urls:
+            logger.info(f'fetching {url}')
+            html = self.fetch(url)
+            for proxy in self.parse(html):
+                yield proxy

+ 31 - 0
proxypool/crawlers/daili66.py

@@ -0,0 +1,31 @@
+from pyquery import PyQuery as pq
+from proxypool.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+
+
+BASE_URL = 'http://www.66ip.cn/{page}.html'
+MAX_PAGE = 5
+
+class Daili66Crawler(BaseCrawler):
+    """
+    daili66 crawler, http://www.66ip.cn/1.html
+    """
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+    
+    
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        doc = pq(html)
+        trs = doc('.containerbox table tr:gt(0)').items()
+        for tr in trs:
+            host = tr.find('td:nth-child(1)').text()
+            port = int(tr.find('td:nth-child(2)').text())
+            yield Proxy(host=host, port=port)
+
+if __name__ == '__main__':
+    crawler = Daili66Crawler()
+    for proxy in crawler.crawl():
+        print(proxy)

+ 31 - 0
proxypool/crawlers/ip3366.py

@@ -0,0 +1,31 @@
+from proxypool.crawlers.base import BaseCrawler
+from proxypool.proxy import Proxy
+import re
+
+
+MAX_PAGE = 5
+BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
+
+class IP3366Crawler(BaseCrawler):
+    """
+    ip3366 crawler, http://www.ip3366.net/
+    """
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+    
+    
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
+        # \s* matches whitespace, including line breaks
+        re_ip_address = ip_address.findall(html)
+        for address, port in re_ip_address:
+            proxy = Proxy(host=address.strip(), port=int(port.strip()))
+            yield proxy
+
+if __name__ == '__main__':
+    crawler = IP3366Crawler()
+    for proxy in crawler.crawl():
+        print(proxy)

+ 34 - 0
proxypool/crawlers/iphai.py

@@ -0,0 +1,34 @@
+from proxypool.crawlers.base import BaseCrawler
+from proxypool.proxy import Proxy
+import re
+
+
+BASE_URL = 'http://www.iphai.com/'
+
+class IPHaiCrawler(BaseCrawler):
+    """
+    iphai crawler, http://www.iphai.com/
+    """
+    urls = [BASE_URL]
+    
+    
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
+        trs = find_tr.findall(html)
+        for s in range(1, len(trs)):
+            find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
+            re_ip_address = find_ip.findall(trs[s])
+            find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
+            re_port = find_port.findall(trs[s])
+            for address, port in zip(re_ip_address, re_port):
+                proxy = Proxy(host=address.strip(), port=int(port.strip()))
+                yield proxy
+
+if __name__ == '__main__':
+    crawler = IPHaiCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)

+ 92 - 72
proxypool/db.py

@@ -1,111 +1,131 @@
 import redis
 from proxypool.error import PoolEmptyError
+from proxypool.proxy import Proxy
 from proxypool.setting import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_KEY
-from proxypool.setting import MAX_SCORE, MIN_SCORE, INITIAL_SCORE
+from proxypool.setting import PROXY_SCORE_MAX, PROXY_SCORE_MIN, PROXY_SCORE_INIT
 from random import choice
-import re
+from typing import List
+from loguru import logger
+
+from proxypool.utils import is_valid_proxy, convert_proxy_or_proxies
+
+
+REDIS_CLIENT_VERSION = redis.__version__
+IS_REDIS_VERSION_2 = REDIS_CLIENT_VERSION.startswith('2.')


 class RedisClient(object):
-    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
+    """
+    redis connection client of proxypool
+    """
+    
+    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, **kwargs):
         """
-        initialize
-        :param host: Redis host
-        :param port: Redis port
-        :param password: Redis password
+        init redis client
+        :param host: redis host
+        :param port: redis port
+        :param password: redis password
         """
-        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
+        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True, **kwargs)
    
-    def add(self, proxy, score=INITIAL_SCORE):
+    def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
         """
-        add a proxy and set its score to the highest
-        :param proxy: proxy
-        :param score: score
-        :return: add result
+        add proxy and set it to init score
+        :param proxy: proxy, ip:port, like 8.8.8.8:88
+        :param score: int score
+        :return: result
         """
-        if not re.match('\d+\.\d+\.\d+\.\d+\:\d+', proxy):
-            print('proxy does not match the expected format', proxy, 'discard it')
+        if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
+            logger.info(f'invalid proxy {proxy}, throw it')
             return
         if not self.db.zscore(REDIS_KEY, proxy):
-            #return self.db.zadd(REDIS_KEY, score, proxy)
-            # updated for redis 3.0+ to fix the error raised after the upgrade; restore the line above for older versions
-            return self.db.zadd(REDIS_KEY, {proxy:score})
+            if IS_REDIS_VERSION_2:
+                return self.db.zadd(REDIS_KEY, score, proxy)
+            return self.db.zadd(REDIS_KEY, {proxy: score})
    
-    def random(self):
-        """
-        get a random valid proxy: first try proxies with the highest score, otherwise fall back to ranked proxies, otherwise raise an error
-        :return: random proxy
-        """
-        result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
-        if len(result):
-            return choice(result)
-        else:
-            result = self.db.zrevrange(REDIS_KEY, 0, 100)
-            if len(result):
-                return choice(result)
-            else:
-                raise PoolEmptyError
+    def random(self) -> Proxy:
+        """
+        get random proxy
+        firstly try to get proxy with max score
+        if not exists, try to get proxy by rank
+        if not exists, raise error
+        :return: proxy, like 8.8.8.8:8
+        """
+        # try to get proxy with max score
+        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
+        if len(proxies):
+            return convert_proxy_or_proxies(choice(proxies))
+        # else get proxy by rank
+        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
+        if len(proxies):
+            return convert_proxy_or_proxies(choice(proxies))
+        # else raise error
+        raise PoolEmptyError
    
-    def decrease(self, proxy):
-        """
-        decrease the score of a proxy by one; remove it if the score drops below the minimum
-        :param proxy: proxy
-        :return: new proxy score
-        """
-        score = self.db.zscore(REDIS_KEY, proxy)
-        if score and score > MIN_SCORE:
-            print('proxy', proxy, 'current score', score, 'decrease 1')
-            #return self.db.zincrby(REDIS_KEY, proxy, -1)
-            # updated for redis 3.0+ to fix the error raised after the upgrade; restore the line above for older versions
-            return self.db.zincrby(REDIS_KEY, -1, proxy)
+    def decrease(self, proxy: Proxy) -> int:
+        """
+        decrease score of proxy, if smaller than PROXY_SCORE_MIN, delete it
+        :param proxy: proxy
+        :return: new score
+        """
+        score = self.db.zscore(REDIS_KEY, proxy.string())
+        # current score is larger than PROXY_SCORE_MIN
+        if score and score > PROXY_SCORE_MIN:
+            logger.info(f'{proxy.string()} current score {score}, decrease 1')
+            if IS_REDIS_VERSION_2:
+                return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
+            return self.db.zincrby(REDIS_KEY, -1, proxy.string())
+        # otherwise delete proxy
         else:
-            print('proxy', proxy, 'current score', score, 'remove')
+            logger.info(f'{proxy.string()} current score {score}, remove')
             return self.db.zrem(REDIS_KEY, proxy)
    
-    def exists(self, proxy):
+    def exists(self, proxy: Proxy) -> bool:
         """
-        check whether a proxy exists
-        :param proxy: proxy
-        :return: whether it exists
+        if proxy exists
+        :param proxy: proxy
+        :return: if exists, bool
         """
-        return not self.db.zscore(REDIS_KEY, proxy) == None
+        return not self.db.zscore(REDIS_KEY, proxy.string()) == None
    
-    def max(self, proxy):
+    def max(self, proxy: Proxy) -> int:
         """
-        set the proxy score to MAX_SCORE
-        :param proxy: proxy
-        :return: set result
+        set proxy to max score
+        :param proxy: proxy
+        :return: new score
         """
-        print('proxy', proxy, 'is valid, set to', MAX_SCORE)
-        #return self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)
-        # updated for redis 3.0+ to fix the error raised after the upgrade; restore the line above for older versions
-        return self.db.zadd(REDIS_KEY, {proxy:MAX_SCORE})
+        logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
+        if IS_REDIS_VERSION_2:
+            return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
+        return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    
-    def count(self):
+    def count(self) -> int:
         """
-        get the count
-        :return: count
+        get count of proxies
+        :return: count, int
         """
         return self.db.zcard(REDIS_KEY)
    
-    def all(self):
+    def all(self) -> List[Proxy]:
         """
-        get all proxies
-        :return: list of all proxies
+        get all proxies
+        :return: list of proxies
         """
-        return self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
+        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
    
-    def batch(self, start, stop):
+    def batch(self, start, end) -> List[Proxy]:
         """
-        get a batch of proxies
-        :param start: start index
-        :param stop: end index
-        :return: list of proxies
+        get batch of proxies
+        :param start: start index
+        :param end: end index
+        :return: list of proxies
         """
-        return self.db.zrevrange(REDIS_KEY, start, stop - 1)
+        return convert_proxy_or_proxies(self.db.zrevrange(REDIS_KEY, start, end - 1))


 if __name__ == '__main__':
     conn = RedisClient()
-    result = conn.batch(680, 688)
+    result = conn.batch(1, 10)
     print(result)
+    result = conn.random()
+    print(result, type(result))

+ 5 - 5
proxypool/error.py

@@ -1,7 +1,7 @@
 class PoolEmptyError(Exception):
-
-    def __init__(self):
-        Exception.__init__(self)
-
     def __str__(self):
-        return repr('the proxy pool is exhausted')
+        """
+        the proxy pool is exhausted
+        :return:
+        """
+        return repr('no proxy in proxypool')

+ 34 - 20
proxypool/getter.py

@@ -1,30 +1,44 @@
-from proxypool.tester import Tester
 from proxypool.db import RedisClient
-from proxypool.crawler import Crawler
-from proxypool.setting import *
-import sys
+from proxypool.setting import PROXY_NUMBER_MAX
+from proxypool.crawlers import __all__ as crawlers_cls
+

 class Getter():
+    """
+    getter of proxypool
+    """
+    
+    
     def __init__(self):
+        """
+        init db and crawlers
+        """
         self.redis = RedisClient()
-        self.crawler = Crawler()
+        self.crawlers_cls = crawlers_cls
+        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
+    
    
-    def is_over_threshold(self):
+    def is_full(self):
         """
-        check whether the proxy pool limit has been reached
+        if proxypool is full
+        :return: bool
         """
-        if self.redis.count() >= POOL_UPPER_THRESHOLD:
-            return True
-        else:
-            return False
+        return self.redis.count() >= PROXY_NUMBER_MAX
+    
    
     def run(self):
-        print('getter is running')
-        if not self.is_over_threshold():
-            for callback_label in range(self.crawler.__CrawlFuncCount__):
-                callback = self.crawler.__CrawlFunc__[callback_label]
-                # get proxies
-                proxies = self.crawler.get_proxies(callback)
-                sys.stdout.flush()
-                for proxy in proxies:
-                    self.redis.add(proxy)
+        """
+        run crawlers to get proxy
+        :return:
+        """
+        if self.is_full():
+            return
+        for crawler in self.crawlers:
+            for proxy in crawler.crawl():
+                print('proxy', proxy)
+                self.redis.add(proxy)
+
+
+if __name__ == '__main__':
+    getter = Getter()
+    getter.run()

+ 0 - 22
proxypool/importer.py

@@ -1,22 +0,0 @@
-from proxypool.db import RedisClient
-
-conn = RedisClient()
-
-
-def set(proxy):
-    result = conn.add(proxy)
-    print(proxy)
-    print('added successfully' if result else 'failed to add')
-
-
-def scan():
-    print('please input proxies, input "exit" to quit')
-    while True:
-        proxy = input()
-        if proxy == 'exit':
-            break
-        set(proxy)
-
-
-if __name__ == '__main__':
-    scan()

+ 22 - 0
proxypool/proxy.py

@@ -0,0 +1,22 @@
+from attr import attrs, attr
+
+
+@attrs
+class Proxy(object):
+    """
+    proxy schema
+    """
+    host = attr(type=str, default=None)
+    port = attr(type=int, default=None)
+    
+    def __str__(self):
+        return f'{self.host}:{self.port}'
+    
+    def string(self):
+        return self.__str__()
+
+
+if __name__ == '__main__':
+    proxy = Proxy(host='8.8.8.8', port=8888)
+    print('proxy', proxy)
+    print('proxy', proxy.string())

+ 62 - 40
proxypool/setting.py

@@ -1,40 +1,62 @@
-# redis host
-REDIS_HOST = '127.0.0.1'
-
-# redis port
-REDIS_PORT = 6379
-
-# redis password, set to None if there is no password
-REDIS_PASSWORD = None
-
-REDIS_KEY = 'proxies'
-
-# proxy scores
-MAX_SCORE = 100
-MIN_SCORE = 0
-INITIAL_SCORE = 10
-
-VALID_STATUS_CODES = [200, 302]
-
-# upper limit of the proxy pool
-POOL_UPPER_THRESHOLD = 50000
-
-# tester cycle
-TESTER_CYCLE = 20
-# getter cycle
-GETTER_CYCLE = 300
-
-# test URL; ideally test against the site you plan to crawl
-TEST_URL = 'http://www.baidu.com'
-
-# API config
-API_HOST = '0.0.0.0'
-API_PORT = 5555
-
-# switches
-TESTER_ENABLED = True
-GETTER_ENABLED = True
-API_ENABLED = True
-
-# max batch test size
-BATCH_TEST_SIZE = 10
+from environs import Env
+from proxypool.utils import parse_redis_connection_string
+
+
+env = Env()
+env.read_env()
+
+# definition of environments
+DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod'
+APP_ENV = env.str('APP_ENV', DEV_MODE).lower()
+APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False)
+APP_DEV = IS_DEV = APP_ENV == DEV_MODE
+APP_PROD = IS_PROD = APP_ENV == PROD_MODE
+APP_TEST = IS_TEST = APP_ENV == TEST_MODE
+
+# redis host
+REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1')
+# redis port
+REDIS_PORT = env.int('REDIS_PORT', 6379)
+# redis password, if no password, set it to None
+REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
+# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port
+# REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
+
+# if REDIS_CONNECTION_STRING:
+#     REDIS_HOST, REDIS_PORT, REDIS_PASSWORD = parse_redis_connection_string(REDIS_CONNECTION_STRING)
+
+# redis hash table key name
+REDIS_KEY = env.str('REDIS_KEY', 'proxies')
+
+# definition of proxy scores
+PROXY_SCORE_MAX = 100
+PROXY_SCORE_MIN = 0
+PROXY_SCORE_INIT = 10
+
+# definition of proxy number
+PROXY_NUMBER_MAX = 50000
+PROXY_NUMBER_MIN = 0
+
+# definition of tester cycle, it will test every CYCLE_TESTER second
+CYCLE_TESTER = env.int('CYCLE_TESTER', 20)
+# definition of getter cycle, it will get proxy every CYCLE_GETTER second
+CYCLE_GETTER = env.int('CYCLE_GETTER', 100)
+
+# definition of tester
+TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
+TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
+TEST_BATCH = env.int('TEST_BATCH', 20)
+# TEST_HEADERS = env.json('TEST_HEADERS', {
+#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
+# })
+TEST_VALID_STATUS = env.list('TEST_VALID_STATUS', [200, 206, 302])
+
+# definition of api
+API_HOST = env.str('API_HOST', '0.0.0.0')
+API_PORT = env.int('API_PORT', 5555)
+API_THREADED = env.bool('API_THREADED', True)
+
+# flags of enable
+ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
+ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
+ENABLE_API = env.bool('ENABLE_API', True)

+ 41 - 18
proxypool/utils.py

@@ -1,27 +1,50 @@
 import requests
 from requests.exceptions import ConnectionError
+import re

-base_headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
-    'Accept-Encoding': 'gzip, deflate, sdch',
-    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
-}
+from proxypool.proxy import Proxy


-def get_page(url, options={}):
+def parse_redis_connection_string(connection_string):
     """
-    crawl proxies
-    :param url:
-    :param options:
+    parse a redis connection string, for example:
+    redis://[password]@host:port
+    rediss://[password]@host:port
+    :param connection_string:
     :return:
     """
-    headers = dict(base_headers, **options)
-    print('crawling', url)
-    try:
-        response = requests.get(url, headers=headers)
-        print('crawled successfully', url, response.status_code)
-        if response.status_code == 200:
-            return response.text
-    except ConnectionError:
-        print('failed to crawl', url)
+    result = re.match('rediss?:\/\/(.*?)@(.*?):(\d+)', connection_string)
+    return (result.group(2), int(result.group(3)), result.group(1) or None) if result \
+        else ('localhost', 6379, None)
+
+
+def is_valid_proxy(data):
+    """
+    check whether data is in a valid proxy format
+    :param data:
+    :return:
+    """
+    return re.match('\d+\.\d+\.\d+\.\d+\:\d+', data)
+
+
+def convert_proxy_or_proxies(data):
+    """
+    convert list of str to valid proxies or proxy
+    :param data:
+    :return:
+    """
+    print(data)
+    if not data:
         return None
+    if isinstance(data, list):
+        result = []
+        for item in data:
+            # skip invalid item
+            item = item.strip()
+            if not is_valid_proxy(item): continue
+            host, port = item.split(':')
+            result.append(Proxy(host=host, port=int(port)))
+        return result
+    if isinstance(data, str) and is_valid_proxy(data):
+        host, port = data.split(':')
+        return Proxy(host=host, port=int(port))