Germey 8 years ago
parent
commit
133b4e0890
8 changed files with 41 additions and 23 deletions
  1. 4 0
      importer.py
  2. 2 2
      proxypool/crawler.py
  3. 14 6
      proxypool/db.py
  4. 2 1
      proxypool/getter.py
  5. 1 0
      proxypool/scheduler.py
  6. 1 1
      proxypool/setting.py
  7. 10 8
      proxypool/tester.py
  8. 7 5
      run.py

+ 4 - 0
importer.py

@@ -0,0 +1,4 @@
+from proxypool.importer import scan
+
+if __name__ == '__main__':
+    scan()  # script entry point: delegate to proxypool.importer.scan

+ 2 - 2
proxypool/crawler.py

@@ -1,4 +1,4 @@
-ccimport json
+import json
 import re
 from .utils import get_page
 from pyquery import PyQuery as pq
@@ -25,7 +25,7 @@ class Crawler(object, metaclass=ProxyMetaclass):
         return proxies
         
     def crawl_daxiang(self):
-        url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=100&filter=on'
+        url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=50&filter=on'
         html = get_page(url)
         if html:
             urls = html.split('\n')

+ 14 - 6
proxypool/db.py

@@ -3,6 +3,7 @@ from proxypool.error import PoolEmptyError
 from proxypool.setting import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_KEY
 from proxypool.setting import MAX_SCORE, MIN_SCORE, INITIAL_SCORE
 from random import choice
+import re
 
 
 class RedisClient(object):
@@ -22,6 +23,9 @@ class RedisClient(object):
         :param score: 分数
         :return: 添加结果
         """
+        if not re.match('\d+\.\d+\.\d+\.\d+\:\d+', proxy):
+            print('代理不符合规范', proxy, '丢弃')
+            return
         if not self.db.zscore(REDIS_KEY, proxy):
             return self.db.zadd(REDIS_KEY, score, proxy)
     
@@ -84,14 +88,18 @@ class RedisClient(object):
         :return: 全部代理列表
         """
         return self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
+    
+    def batch(self, start, stop):
+        """
+        Fetch a slice of proxies by rank via ZREVRANGE (highest score first).
+        :param start: first rank index, inclusive
+        :param stop: end rank index, exclusive (ZREVRANGE's stop is inclusive, hence stop - 1)
+        :return: list of proxies
+        """
+        return self.db.zrevrange(REDIS_KEY, start, stop - 1)
 
 
 if __name__ == '__main__':
     conn = RedisClient()
-    result = conn.all()
+    result = conn.batch(680, 688)  # ad-hoc manual check: ranks 680..687 of the sorted set
     print(result)
-    random = conn.random()
-    print('Random', random)
-    top = conn.top()
-    print('Top', top)
-    conn.decrease('a')

+ 2 - 1
proxypool/getter.py

@@ -2,7 +2,7 @@ from proxypool.tester import Tester
 from proxypool.db import RedisClient
 from proxypool.crawler import Crawler
 from proxypool.setting import *
-
+import sys
 
 class Getter():
     def __init__(self):
@@ -25,5 +25,6 @@ class Getter():
                 callback = self.crawler.__CrawlFunc__[callback_label]
                 # 获取代理
                 proxies = self.crawler.get_proxies(callback)
+                sys.stdout.flush()
                 for proxy in proxies:
                     self.redis.add(proxy)

+ 1 - 0
proxypool/scheduler.py

@@ -6,6 +6,7 @@ from proxypool.tester import Tester
 from proxypool.db import RedisClient
 from proxypool.setting import *
 
+
 class Scheduler():
     def schedule_tester(self, cycle=TESTER_CYCLE):
         """

+ 1 - 1
proxypool/setting.py

@@ -37,4 +37,4 @@ GETTER_ENABLED = True
 API_ENABLED = True
 
 # 最大批测试量
-BATCH_TEST_SIZE = 100
+BATCH_TEST_SIZE = 10

+ 10 - 8
proxypool/tester.py

@@ -1,10 +1,7 @@
 import asyncio
 import aiohttp
 import time
-<<<<<<< HEAD
-=======
-
->>>>>>> dbb6bb89903b1cd158470ac472a5a930f7f0978b
+import sys
 try:
     from aiohttp import ClientError
 except:
@@ -48,12 +45,17 @@ class Tester(object):
         """
         print('测试器开始运行')
         try:
-            proxies = self.redis.all()
-            loop = asyncio.get_event_loop()
-            for i in range(0, len(proxies), BATCH_TEST_SIZE):
-                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
+            count = self.redis.count()
+            print('当前剩余', count, '个代理')
+            for i in range(0, count, BATCH_TEST_SIZE):
+                start = i
+                stop = min(i + BATCH_TEST_SIZE, count)
+                print('正在测试第', start + 1, '-', stop, '个代理')
+                test_proxies = self.redis.batch(start, stop)
+                loop = asyncio.get_event_loop()
                 tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                 loop.run_until_complete(asyncio.wait(tasks))
+                sys.stdout.flush()
                 time.sleep(5)
         except Exception as e:
             print('测试器发生错误', e.args)

+ 7 - 5
run.py

@@ -1,14 +1,16 @@
 from proxypool.scheduler import Scheduler
 import sys
 import io
+
 sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
 
+
 def main():
-	try:
-	    s = Scheduler()
-	    s.run()
-	except:
-	    main()    
+    try:  # run the scheduler; any failure falls through to a restart
+        s = Scheduler()
+        s.run()
+    except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
+        main()  # restarts by recursing, so each failure nests one frame deeper
 
 
 if __name__ == '__main__':