Germey committed 83fccfb064 (8 years ago)
4 changed files with 97 additions and 36 deletions
  1. proxypool/crawler.py   +77 -25
  2. proxypool/setting.py   +5 -5
  3. proxypool/tester.py    +7 -3
  4. run.py                 +8 -3

+ 77 - 25
proxypool/crawler.py

@@ -1,5 +1,5 @@
 import json
-
+import re
 from .utils import get_page
 from pyquery import PyQuery as pq
 
@@ -23,32 +23,14 @@ class Crawler(object, metaclass=ProxyMetaclass):
             print('Successfully got proxy', proxy)
             proxies.append(proxy)
         return proxies
-
-    def crawl_xdaili(self):
-        """
-        Get proxies from xdaili
-        :return: proxy
-        """
-        url = 'http://www.xdaili.cn/ipagent/greatRecharge/getGreatIp?spiderId=da289b78fec24f19b392e04106253f2a&orderno=YZ20177140586mTTnd7&returnType=2&count=20'
-        html = get_page(url)
-        if html:
-            result = json.loads(html)
-            proxies = result.get('RESULT')
-            for proxy in proxies:
-                yield proxy.get('ip') + ':' + proxy.get('port')
-
-    def crawl_kuaidaili(self):
-        """
-        Get proxies from kuaidaili
-        :return: proxy
-        """
-        url = 'http://dev.kuaidaili.com/api/getproxy/?orderid=959961765125099&num=100&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=1&an_an=1&an_ha=1&quality=1&format=json&sep=2'
+
+    def crawl_daxiang(self):
+        url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=100&filter=on'
         html = get_page(url)
         if html:
-            result = json.loads(html)
-            proxies = result.get('data').get('proxy_list')
-            for proxy in proxies:
-                yield proxy
+            proxies = html.split('\n')
+            for proxy in proxies:
+                yield proxy
 
     def crawl_daili66(self, page_count=4):
         """
@@ -98,3 +80,73 @@ class Crawler(object, metaclass=ProxyMetaclass):
             for td in tds:
                 td.find('p').remove()
                 yield td.text().replace(' ', '')
+
+    def crawl_ip181(self):
+        start_url = 'http://www.ip181.com/'
+        html = get_page(start_url)
+        if html:
+            ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
+            # \s* matches whitespace, so the pattern works across line breaks
+            re_ip_address = ip_address.findall(html)
+            for address, port in re_ip_address:
+                result = address + ':' + port
+                yield result.replace(' ', '')
+
+
+    def crawl_ip3366(self):
+        for page in range(1, 4):
+            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
+            html = get_page(start_url)
+            if html:
+                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
+                # \s* matches whitespace, so the pattern works across line breaks
+                re_ip_address = ip_address.findall(html)
+                for address, port in re_ip_address:
+                    result = address + ':' + port
+                    yield result.replace(' ', '')
+
+
+    def crawl_data5u(self):
+        for i in ['gngn', 'gnpt']:
+            start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
+            html = get_page(start_url)
+            if html:
+                ip_address = re.compile(r' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>')
+                # \s* matches whitespace, so the pattern works across line breaks
+                re_ip_address = ip_address.findall(html)
+                for address, port in re_ip_address:
+                    result = address + ':' + port
+                    yield result.replace(' ', '')
+
+    def crawl_kxdaili(self):
+        for i in range(1, 4):
+            start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
+            html = get_page(start_url)
+            if html:
+                ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
+                # \s* matches whitespace, so the pattern works across line breaks
+                re_ip_address = ip_address.findall(html)
+                for address, port in re_ip_address:
+                    result = address + ':' + port
+                    yield result.replace(' ', '')
+
+
+    def crawl_premproxy(self):
+        for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
+            start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
+            html = get_page(start_url)
+            if html:
+                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
+                re_ip_address = ip_address.findall(html)
+                for address_port in re_ip_address:
+                    yield address_port.replace(' ', '')
+
+    def crawl_xroxy(self):
+        for i in ['CN', 'TW']:
+            start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
+            html = get_page(start_url)
+            if html:
+                ip_address1 = re.compile(r"title='View this Proxy details'>\s*(.*).*")
+                re_ip_address1 = ip_address1.findall(html)
+                ip_address2 = re.compile("title='Select proxies with port number .*'>(.*)</a>")
+                re_ip_address2 = ip_address2.findall(html)
+                for address, port in zip(re_ip_address1, re_ip_address2):
+                    address_port = address + ':' + port
+                    yield address_port.replace(' ', '')
+
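All of the new crawlers follow the same pattern: fetch a listing page, pull address/port pairs out of the HTML with a regex, and yield them as 'ip:port' strings. A minimal standalone sketch of that pattern, using requests as a stand-in for the project's get_page helper (the URL and regex below are just the ip3366 example from the diff):

    import re
    import requests

    def fetch_proxies(url, pattern):
        # Fetch the page and yield 'ip:port' strings matched by the pattern.
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return
        for address, port in re.findall(pattern, resp.text):
            yield address.strip() + ':' + port.strip()

    if __name__ == '__main__':
        url = 'http://www.ip3366.net/free/?stype=1&page=1'
        pattern = r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
        for proxy in fetch_proxies(url, pattern):
            print(proxy)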

+ 5 - 5
proxypool/setting.py

@@ -1,11 +1,11 @@
 # Redis host
-REDIS_HOST = 'localhost'
+REDIS_HOST = 'DataCrawl-Pool.redis.cache.chinacloudapi.cn'
 
 # Redis port
 REDIS_PORT = 6379
 
 # Redis password; set to None if there is none
-REDIS_PASSWORD = 'foobared'
+REDIS_PASSWORD = 'A0y1VJ6t9B7R5d6id1/2Rk/aDvZXjdwAR/tWylORuwA='
 
 REDIS_KEY = 'proxies'
 
@@ -14,15 +14,15 @@ MAX_SCORE = 100
 MIN_SCORE = 0
 INITIAL_SCORE = 10
 
-VALID_STATUS_CODES = [200]
+VALID_STATUS_CODES = [200, 302]
 
 # Proxy pool size limit
-POOL_UPPER_THRESHOLD = 10000
+POOL_UPPER_THRESHOLD = 50000
 
 # Tester cycle
 TESTER_CYCLE = 20
 # Getter cycle
-GETTER_CYCLE = 20
+GETTER_CYCLE = 300
 
 # Test URL; test against whichever site you plan to crawl
 TEST_URL = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2145291155&containerid=1076032145291155&page=14'
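Adding 302 to VALID_STATUS_CODES means proxies that only manage a redirect on the Weibo test URL are still counted as working. A rough sketch of how such a check is typically written with aiohttp; the project's real logic lives in Tester.test_single_proxy and may differ:

    import asyncio
    import aiohttp

    VALID_STATUS_CODES = [200, 302]
    TEST_URL = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2145291155&containerid=1076032145291155&page=14'

    async def is_valid(proxy):
        # True if the proxy answers the test URL with an accepted status code.
        async with aiohttp.ClientSession() as session:
            async with session.get(TEST_URL, proxy='http://' + proxy,
                                   timeout=15, allow_redirects=False) as resp:
                return resp.status in VALID_STATUS_CODES

    if __name__ == '__main__':
        loop = asyncio.get_event_loop()
        try:
            print(loop.run_until_complete(is_valid('127.0.0.1:8080')))
        except Exception as e:
            print('proxy check failed:', e)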

+ 7 - 3
proxypool/tester.py

@@ -1,6 +1,6 @@
 import asyncio
 import aiohttp
-
+import time
 try:
     from aiohttp import ClientError
 except:
@@ -46,7 +46,11 @@ class Tester(object):
         try:
             proxies = self.redis.all()
             loop = asyncio.get_event_loop()
-            tasks = [self.test_single_proxy(proxy) for proxy in proxies]
-            loop.run_until_complete(asyncio.wait(tasks))
+            batch_size = 100
+            for i in range(0, len(proxies), batch_size):
+                test_proxies = proxies[i:i + batch_size]
+                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
+                loop.run_until_complete(asyncio.wait(tasks))
+                time.sleep(5)
         except Exception as e:
             print('Tester error occurred', e.args)
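Testing 100 proxies per asyncio.wait call with a five-second pause keeps the tester from opening every connection at once, at the cost of blocking the whole process during time.sleep. An alternative sketch that caps concurrency with a semaphore instead (the check coroutine here is a placeholder, not the project's test_single_proxy):

    import asyncio

    async def check(proxy):
        # Placeholder for a real per-proxy test.
        await asyncio.sleep(0.1)
        return proxy

    async def test_all(proxies, limit=100):
        # At most `limit` checks run at the same time.
        semaphore = asyncio.Semaphore(limit)

        async def guarded(proxy):
            async with semaphore:
                return await check(proxy)

        return await asyncio.gather(*(guarded(p) for p in proxies))

    if __name__ == '__main__':
        proxies = ['1.2.3.{}:80'.format(i) for i in range(250)]
        loop = asyncio.get_event_loop()
        results = loop.run_until_complete(test_all(proxies))
        print(len(results), 'proxies checked')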

+ 8 - 3
run.py

@@ -1,9 +1,14 @@
 from proxypool.scheduler import Scheduler
-
+import sys
+import io
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
 
 def main():
-    s = Scheduler()
-    s.run()
+    try:
+        s = Scheduler()
+        s.run()
+    except Exception:
+        main()
 
 
 if __name__ == '__main__':
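Restarting by calling main() from its own except block works for occasional crashes, but each restart adds a stack frame, so a scheduler that keeps failing will eventually hit RecursionError. A loop keeps the same restart-on-error intent without growing the stack; a rough sketch, assuming Scheduler.run() blocks until something goes wrong:

    import time
    from proxypool.scheduler import Scheduler

    def main():
        while True:
            try:
                Scheduler().run()
            except Exception as e:
                print('scheduler stopped, restarting:', e.args)
                time.sleep(5)  # brief pause before restarting

    if __name__ == '__main__':
        main()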