
change requirements

Germey 8 years ago
commit 5ea405a147
8 changed files with 46 additions and 98 deletions
  1. proxypool/api.py (+8 −4)
  2. proxypool/db.py (+14 −16)
  3. proxypool/error.py (+0 −7)
  4. proxypool/getter.py (+1 −18)
  5. proxypool/schedule.py (+10 −37)
  6. proxypool/setting.py (+1 −1)
  7. requirements.txt (+3 −5)
  8. setup.py (+9 −10)

+ 8 - 4
proxypool/api.py

@@ -8,7 +8,8 @@ app = Flask(__name__)
 
 
 def get_conn():
-    """Opens a new redis connection if there is none yet for the
+    """
+    Opens a new redis connection if there is none yet for the
     current application context.
     """
     if not hasattr(g, 'redis_client'):
@@ -18,12 +19,13 @@ def get_conn():
 
 @app.route('/')
 def index():
-    return '<h1>Welcome</h1>'
+    return '<h2>Welcome to the Proxy Pool System</h2>'
 
 
 @app.route('/get')
 def get_proxy():
-    """Get a proxy
+    """
+    Get a proxy
     """
     conn = get_conn()
     return conn.pop()
@@ -31,10 +33,12 @@ def get_proxy():
 
 @app.route('/count')
 def get_counts():
-    """Get the count of proxies
+    """
+    Get the count of proxies
     """
     conn = get_conn()
     return str(conn.queue_len)
 
+
 if __name__ == '__main__':
     app.run()
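
A quick note on the API surface: / serves a welcome page, /get pops one proxy, and /count reports the pool size. A minimal client sketch, assuming the app runs on Flask's default localhost:5000 (illustrative only, not part of this commit):

    import requests

    BASE = 'http://localhost:5000'  # assumed default Flask host/port

    # /get returns one proxy popped from the right end of the redis queue
    proxy = requests.get(BASE + '/get').text

    # /count returns the queue length as a string
    total = int(requests.get(BASE + '/count').text)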

+ 14 - 16
proxypool/db.py

@@ -4,38 +4,34 @@ from proxypool.setting import HOST, PORT
 
 
 class RedisClient(object):
-    """
-    Redis database helper class.
-    """
-
     def __init__(self, host=HOST, port=PORT):
         self._db = redis.Redis(host, port)
 
     def get(self, count=1):
-        """Fetch a batch of proxies from the pool."""
+        """
+        Get up to count proxies from the left end of the redis list.
+        """
         proxies = self._db.lrange("proxies", 0, count - 1)
         self._db.ltrim("proxies", count, -1)
         print(proxies)
         return proxies
 
     def put(self, proxy):
-        """Push a proxy into the pool.
-        A Redis set container handles deduplication: if the proxy can be
-        added to the set, push it into the proxy pool; otherwise skip it.
         """
-        if self._db.sadd("set", proxy):
-            self._db.rpush("proxies", proxy)
-        else:
-            pass
+        Append a proxy to the right end of the queue.
+        """
+        self._db.rpush("proxies", proxy)
 
     def put_many(self, proxies):
-        """Push a batch of proxies into the pool.
+        """
+        Append many proxies to the right end of the queue.
         """
         for proxy in proxies:
             self.put(proxy)
 
     def pop(self):
-        """Pop a usable proxy.
+        """
+        Pop a proxy from the right end of the queue.
         """
         try:
             return self._db.rpop("proxies").decode('utf-8')
@@ -44,12 +40,14 @@ class RedisClient(object):
 
     @property
     def queue_len(self):
-        """Get the size of the proxy pool.
+        """
+        Get the length of the proxies queue.
         """
         return self._db.llen("proxies")
 
     def flush(self):
-        """Flush everything in Redis (testing only).
+        """
+        Flush the whole db (testing only).
         """
         self._db.flushall()
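
With the set-based deduplication removed, RedisClient is now a plain queue over the "proxies" list: put and put_many append on the right, get trims a batch off the left, and pop takes one proxy from the right. A short sketch of the new semantics against a local redis (illustrative; note duplicates are now possible):

    from proxypool.db import RedisClient

    conn = RedisClient()          # connects to HOST:PORT from setting.py
    conn.put('127.0.0.1:8080')    # rpush to the right end
    conn.put('127.0.0.1:8080')    # no dedup any more, stored twice
    batch = conn.get(1)           # lrange + ltrim: takes from the left
    latest = conn.pop()           # rpop from the right, decoded to str
    print(conn.queue_len)         # llen of what remains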
 

+ 0 - 7
proxypool/error.py

@@ -1,8 +1,4 @@
 class ResourceDepletionError(Exception):
-    """
-    Resource depletion exception: raised when no usable proxies can be
-    crawled from any of the source sites.
-    """
 
     def __init__(self):
         Exception.__init__(self)
@@ -12,9 +8,6 @@ class ResourceDepletionError(Exception):
 
 
 class PoolEmptyError(Exception):
-    """
-    Empty pool exception: raised if the proxy pool stays empty for too long.
-    """
 
     def __init__(self):
         Exception.__init__(self)
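
Only the docstrings are removed here; both exceptions keep their __init__ (and, outside these hunks, __str__) behavior. The raise sites are not shown in this commit, so the following sketch of how a caller might surface an empty pool is an assumption:

    from proxypool.error import PoolEmptyError

    def guarded_pop(conn):
        # hypothetical wrapper: treat a missing proxy as an empty pool
        proxy = conn.pop()
        if proxy is None:
            raise PoolEmptyError()
        return proxy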

+ 1 - 18
proxypool/getter.py

@@ -3,11 +3,6 @@ from pyquery import PyQuery as pq
 
 
 class ProxyMetaclass(type):
-    """
-    Metaclass for the crawler: adds the attributes __CrawlFunc__ and
-    __CrawlFuncCount__ to FreeProxyGetter, holding the crawl functions
-    and the number of crawl functions respectively.
-    """
 
     def __new__(cls, name, bases, attrs):
         count = 0
@@ -21,13 +16,7 @@ class ProxyMetaclass(type):
 
 
 class FreeProxyGetter(object, metaclass=ProxyMetaclass):
-    """
-    Proxy crawler that scans the major proxy sites and scrapes proxies.
-    The class is extensible: crawl functions for new sites can be added as
-    needed, but their names must start with crawl_ and they must yield
-    values in "host:port" form; the adder detects and calls them automatically.
-    """
-    
+
     def get_raw_proxies(self, callback):
         proxies = []
         print('Callback', callback)
@@ -37,9 +26,6 @@ class FreeProxyGetter(object, metaclass=ProxyMetaclass):
         return proxies
 
     def crawl_daili66(self, page_count=4):
-        """
-        Crawl proxies from the daili66 site.
-        """
         start_url = 'http://www.66ip.cn/{}.html'
         urls = [start_url.format(page) for page in range(1, page_count + 1)]
         for url in urls:
@@ -54,9 +40,6 @@ class FreeProxyGetter(object, metaclass=ProxyMetaclass):
                     yield ':'.join([ip, port])
 
     def crawl_proxy360(self):
-        """
-        Crawl proxies from the proxy360 site.
-        """
         start_url = 'http://www.proxy360.cn/Region/China'
         print('Crawling', start_url)
         html = get_page(start_url)
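
ProxyMetaclass (kept as context above) builds the crawl registry at class-creation time: every attribute whose name contains crawl_ is appended to __CrawlFunc__ and counted in __CrawlFuncCount__, which is what PoolAdder iterates over. Per the convention the removed docstring described, a new source is just another crawl_ method on FreeProxyGetter; the site and table layout below are made up for illustration:

    def crawl_example(self, page_count=2):
        # hypothetical free-proxy site; URL and markup are assumptions
        start_url = 'http://proxy-list.example.com/{}.html'
        for page in range(1, page_count + 1):
            html = get_page(start_url.format(page))
            if not html:
                continue
            doc = pq(html)
            for row in doc('tr').items():
                ip = row.find('td:nth-child(1)').text()
                port = row.find('td:nth-child(2)').text()
                if ip and port:
                    # crawl_ methods must yield proxies as "host:port"
                    yield ':'.join([ip, port])

No other wiring is needed: the metaclass registers the method when the class body executes, and get_raw_proxies can then be called with it.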

+ 10 - 37
proxypool/schedule.py

@@ -11,10 +11,6 @@ from asyncio import TimeoutError
 
 
 class ValidityTester(object):
-    """
-    Tester that checks unknown proxies asynchronously.
-    """
-    # use the Baidu homepage for the check
     test_api = TEST_API
 
     def __init__(self):
@@ -22,15 +18,12 @@ class ValidityTester(object):
         self._usable_proxies = []
 
     def set_raw_proxies(self, proxies):
-        """
-        Set the proxies to be tested.
-        """
         self._raw_proxies = proxies
-        self._usable_proxies = []
+        self._conn = RedisClient()
 
     async def test_single_proxy(self, proxy):
         """
-        Test a single proxy; if usable, add it to _usable_proxies.
+        Test one proxy; if valid, put it into the pool.
         """
         async with aiohttp.ClientSession() as session:
             try:
@@ -40,14 +33,14 @@ class ValidityTester(object):
                 print('Testing', proxy)
                 async with session.get(self.test_api, proxy=real_proxy, timeout=15) as response:
                     if response.status == 200:
-                        self._usable_proxies.append(proxy)
+                        self._conn.put(proxy)
                         print('Valid proxy', proxy)
-            except (ProxyConnectionError, TimeoutError):
+            except (ProxyConnectionError, TimeoutError, ValueError):
                 print('Invalid proxy', proxy)
 
     def test(self):
         """
-        Asynchronously test all proxies in _raw_proxies.
+        Asynchronously test all raw proxies.
         """
         print('ValidityTester is working')
         try:
@@ -57,13 +50,10 @@ class ValidityTester(object):
         except ValueError:
             print('Async Error')
 
-    def get_usable_proxies(self):
-        return self._usable_proxies
-
 
 class PoolAdder(object):
     """
-    Adder that replenishes the pool with proxies.
+    Add proxies to the pool when it runs low.
     """
 
     def __init__(self, threshold):
@@ -74,7 +64,7 @@ class PoolAdder(object):
 
     def is_over_threshold(self):
         """
-        Check whether the amount of proxies in the pool has reached the threshold.
+        Check whether the pool size has reached the upper threshold.
         """
         if self._conn.queue_len >= self._threshold:
             return True
@@ -82,20 +72,15 @@ class PoolAdder(object):
             return False
 
     def add_to_queue(self):
-        """
-        Tell the crawlers to fetch a batch of untested proxies, test them,
-        and add the ones that pass to the proxy pool.
-        """
         print('PoolAdder is working')
         proxy_count = 0
         while not self.is_over_threshold():
             for callback_label in range(self._crawler.__CrawlFuncCount__):
                 callback = self._crawler.__CrawlFunc__[callback_label]
                 raw_proxies = self._crawler.get_raw_proxies(callback)
+                # test the crawled proxies; the tester pushes valid ones to redis
                 self._tester.set_raw_proxies(raw_proxies)
                 self._tester.test()
-                proxies = self._tester.get_usable_proxies()
-                self._conn.put_many(proxies)
                 proxy_count += len(raw_proxies)
                 if self.is_over_threshold():
                     print('IP is enough, waiting to be used')
@@ -105,16 +90,10 @@ class PoolAdder(object):
 
 
 class Schedule(object):
-    """
-    Main scheduler that coordinates the scheduler modules.
-    """
-
     @staticmethod
     def valid_proxy(cycle=VALID_CHECK_CYCLE):
         """
-        Re-test the proxies already in the pool so they do not expire
-        from sitting unused for too long.
-        Take the front half of the queue, test it, and push passing proxies to the tail.
+        Periodically re-test the front half of the proxies in redis.
         """
         conn = RedisClient()
         tester = ValidityTester()
@@ -128,8 +107,6 @@ class Schedule(object):
             raw_proxies = conn.get(count)
             tester.set_raw_proxies(raw_proxies)
             tester.test()
-            proxies = tester.get_usable_proxies()
-            conn.put_many(proxies)
             time.sleep(cycle)
 
     @staticmethod
@@ -137,8 +114,7 @@ class Schedule(object):
                    upper_threshold=POOL_UPPER_THRESHOLD,
                    cycle=POOL_LEN_CHECK_CYCLE):
         """
-        Coordinate the adder: when usable proxies fall below the lower threshold,
-        start the crawlers to replenish the pool; stop once the upper threshold is reached.
+        If the number of proxies drops below lower_threshold, trigger the adder.
         """
         conn = RedisClient()
         adder = PoolAdder(upper_threshold)
@@ -148,9 +124,6 @@ class Schedule(object):
             time.sleep(cycle)
 
     def run(self):
-        """
-        Run the scheduler: spawn two processes to maintain the proxy pool.
-        """
         print('Ip processing running')
         valid_process = Process(target=Schedule.valid_proxy)
         check_process = Process(target=Schedule.check_pool)
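
Schedule.run spawns the two maintenance processes: valid_proxy re-tests the front half of the queue every VALID_CHECK_CYCLE seconds, while check_pool triggers PoolAdder whenever the pool dips below POOL_LOWER_THRESHOLD. setup.py points its console script at run:cli; that module is not part of this diff, so the entry-point sketch below is an assumption:

    from proxypool.api import app
    from proxypool.schedule import Schedule

    def cli():
        # start both maintenance processes, then serve the HTTP API
        schedule = Schedule()
        schedule.run()
        app.run()

    if __name__ == '__main__':
        cli()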

+ 1 - 1
proxypool/setting.py

@@ -4,7 +4,7 @@ HOST = 'localhost'
 PORT = 6379
 
 
-POOL_LOWER_THRESHOLD = 2
+POOL_LOWER_THRESHOLD = 10
 POOL_UPPER_THRESHOLD = 100
 
 VALID_CHECK_CYCLE = 60

+ 3 - 5
requirements.txt

@@ -1,7 +1,5 @@
-aiohttp==1.0.2
-bs4==0.0.1
+aiohttp==1.3.3
 Flask==0.11.1
-lxml==3.6.0
 redis==2.10.5
-requests==2.10.0
-
+requests==2.13.0
+pyquery==1.2.17

+ 9 - 10
setup.py

@@ -1,15 +1,15 @@
 from setuptools import setup
 
 setup(
-    name='proxypool',
+    name='proxy-pool',
     version='1.0.0',
-    description='High-performance cross-platform proxy pool',
-    long_description='Please go to https://github.com/WiseDoge/ProxyPool',
-    author='wisedoge',
-    author_email='wisedoge@outlook.com',
-    url='https://github.com/WiseDoge/ProxyPool',
+    description='High performance proxy pool',
+    long_description='A proxy pool project modified from WiseDoge/ProxyPool',
+    author='Germey',
+    author_email='cqc@cuiqingcai.com',
+    url='https://github.com/Germey/ProxyPool',
     packages=[
-        'proxypool'
+        'proxy-pool'
     ],
     py_modules=['run'],
     include_package_data=True,
@@ -17,13 +17,12 @@ setup(
     install_requires=[
         'aiohttp',
         'requests',
-        'bs4',
         'flask',
         'redis',
-        'lxml'
+        'pyquery'
     ],
     entry_points={
-        'console_scripts': ['proxypool_run=run:cli']
+        'console_scripts': ['proxy_pool_run=run:cli']
     },
     license='apache 2.0',
     zip_safe=False,