Browse Source

update bug

Germey 8 years ago
parent
commit
e0c9d4bd8e
6 changed files with 57 additions and 58 deletions
  1. 1 1
      README.md
  2. 2 0
      proxypool/api.py
  3. 20 12
      proxypool/crawler.py
  4. 26 36
      proxypool/db.py
  5. 2 2
      proxypool/setting.py
  6. 6 7
      proxypool/tester.py

+ 1 - 1
README.md

@@ -37,7 +37,7 @@ python3 run.py
 
 利用requests获取方法如下
 
-```
+```python
 import requests
 
 PROXY_POOL_URL = 'http://localhost:5000/get'

+ 2 - 0
proxypool/api.py

@@ -22,6 +22,7 @@ def index():
 def get_proxy():
     """
     Get a proxy
+    :return: 随机代理
     """
     conn = get_conn()
     return conn.random()
@@ -31,6 +32,7 @@ def get_proxy():
 def get_counts():
     """
     Get the count of proxies
+    :return: 代理池总量
     """
     conn = get_conn()
     return str(conn.count())

+ 20 - 12
proxypool/crawler.py

@@ -23,29 +23,37 @@ class Crawler(object, metaclass=ProxyMetaclass):
             print('成功获取到代理', proxy)
             proxies.append(proxy)
         return proxies
-    
+
     def crawl_daxiang(self):
+        """
+        获取大象代理
+        :return: 代理
+        """
         url = 'http://vtp.daxiangdaili.com/ip/?tid=555546364094534&num=100'
         html = get_page(url)
         if html:
             urls = html.split('\n')
             for url in urls:
                 yield url
-    
+
     def crawl_kuaidaili(self):
+        """
+        获取快代理
+        :return: 代理
+        """
         url = 'http://dev.kuaidaili.com/api/getproxy/?orderid=959961765125099&num=100&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=1&an_an=1&an_ha=1&quality=1&format=json&sep=2'
         html = get_page(url)
         if html:
             result = json.loads(html)
-            ips = result.get('data').get('proxy_list')
-            for ip in ips:
-                yield ip
-    
+            proxies = result.get('data').get('proxy_list')
+            for proxy in proxies:
+                yield proxy
+
     def crawl_daili66(self, page_count=4):
         """
         获取代理66
-        :param page_count:
-        :return:
+        :param page_count: 页码
+        :return: 代理
         """
         start_url = 'http://www.66ip.cn/{}.html'
         urls = [start_url.format(page) for page in range(1, page_count + 1)]
@@ -59,11 +67,11 @@ class Crawler(object, metaclass=ProxyMetaclass):
                     ip = tr.find('td:nth-child(1)').text()
                     port = tr.find('td:nth-child(2)').text()
                     yield ':'.join([ip, port])
-    
+
     def crawl_proxy360(self):
         """
         获取Proxy360
-        :return:
+        :return: 代理
         """
         start_url = 'http://www.proxy360.cn/Region/China'
         print('Crawling', start_url)
@@ -75,11 +83,11 @@ class Crawler(object, metaclass=ProxyMetaclass):
                 ip = line.find('.tbBottomLine:nth-child(1)').text()
                 port = line.find('.tbBottomLine:nth-child(2)').text()
                 yield ':'.join([ip, port])
-    
+
     def crawl_goubanjia(self):
         """
         获取Goubanjia
-        :return:
+        :return: 代理
         """
         start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
         html = get_page(start_url)

+ 26 - 36
proxypool/db.py

@@ -9,37 +9,26 @@ class RedisClient(object):
     def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
         """
         初始化
-        :param host:
-        :param port:
-        :param password:
+        :param host: Redis 地址
+        :param port: Redis 端口
+        :param password: Redis密码
         """
         self.db = redis.StrictRedis(host=host, port=port, password=password)
-    
-    def top(self):
-        """
-        获取排名第一的代理
-        :return:
-        """
-        proxies = self.db.zrevrange(REDIS_KEY, 0, 0)
-        if proxies:
-            return proxies[0].decode('utf-8')
-        else:
-            raise PoolEmptyError
-    
+
     def add(self, proxy, score=INITIAL_SCORE):
         """
         添加代理,设置分数为最高
-        :param proxy:
-        :param score:
-        :return:
+        :param proxy: 代理
+        :param score: 分数
+        :return: 添加结果
         """
-        if self.db.zscore(proxy):
+        if self.db.zscore(REDIS_KEY, proxy):
             return self.db.zadd(REDIS_KEY, score, proxy)
-    
+
     def random(self):
         """
         随机获取有效代理,首先尝试获取最高分数代理,如果不存在,按照排名获取,否则异常
-        :return:
+        :return: 随机代理
         """
         result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
         if len(result):
@@ -50,48 +39,49 @@ class RedisClient(object):
                 return choice(result).decode('utf-8')
             else:
                 raise PoolEmptyError
-    
+
     def decrease(self, proxy):
         """
         代理值减一分,小于最小值则删除
-        :param proxy:
-        :return:
+        :param proxy: 代理
+        :return: 修改后的代理分数
         """
         score = self.db.zscore(REDIS_KEY, proxy)
         if score and score > MIN_SCORE:
-            self.db.zincrby(REDIS_KEY, proxy, -1)
             print('代理', proxy, '当前分数', score, '减1')
+            return self.db.zincrby(REDIS_KEY, proxy, -1)
         else:
-            self.db.zrem(REDIS_KEY, proxy)
             print('代理', proxy, '当前分数', score, '移除')
-    
+            return self.db.zrem(REDIS_KEY, proxy)
+
     def exists(self, proxy):
         """
         判断是否存在
-        :param proxy: 
-        :return: 
+        :param proxy: 代理
+        :return: 是否存在
         """
         return not self.db.zscore(REDIS_KEY, proxy) == None
-    
+
     def max(self, proxy):
         """
         将代理设置为MAX_SCORE
-        :param proxy:
-        :return:
+        :param proxy: 代理
+        :return: 设置结果
         """
+        print('代理', proxy, '可用,设置为', MAX_SCORE)
         return self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)
-    
+
     def count(self):
         """
         获取数量
-        :return:
+        :return: 数量
         """
         return self.db.zcard(REDIS_KEY)
-    
+
     def all(self):
         """
         获取全部代理
-        :return:
+        :return: 全部代理列表
         """
         all = self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
         return [item.decode('utf-8') for item in all]

+ 2 - 2
proxypool/setting.py

@@ -1,11 +1,11 @@
 # Redis数据库地址
-REDIS_HOST = 'DataCrawl-Pool.redis.cache.chinacloudapi.cn'
+REDIS_HOST = 'localhost'
 
 # Redis端口
 REDIS_PORT = 6379
 
 # Redis密码,如无填None
-REDIS_PASSWORD = None
+REDIS_PASSWORD = 'foobared'
 
 REDIS_KEY = 'proxies'
 

+ 6 - 7
proxypool/tester.py

@@ -12,12 +12,12 @@ from proxypool.setting import *
 class Tester(object):
     def __init__(self):
         self.redis = RedisClient()
-    
+
     async def test_single_proxy(self, proxy):
         """
         测试单个代理
-        :param proxy: 
-        :return: 
+        :param proxy:
+        :return:
         """
         conn = aiohttp.TCPConnector(verify_ssl=False)
         async with aiohttp.ClientSession(connector=conn) as session:
@@ -28,16 +28,15 @@ class Tester(object):
                 print('正在测试', proxy)
                 async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                     if response.status in VALID_STATUS_CODES:
-                        self.redis.add(proxy)
+                        self.redis.max(proxy)
                         print('代理可用', proxy)
                     else:
                         self.redis.decrease(proxy)
                         print('请求响应码不合法,IP', proxy)
             except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
-                if self.redis.exists(proxy):
-                    self.redis.decrease(proxy)
+                self.redis.decrease(proxy)
                 print('代理请求失败', proxy)
-    
+
     def run(self):
         """
         测试主函数