@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+
+'''
+Fetch proxy IPs from a local proxy-pool API and use each one, after checking
+that it geolocates to China, to request the target voting page once.
+'''
+import requests
+import time
+import threading
+import urllib3
+from fake_headers import Headers
+import uuid
+from geolite2 import geolite2
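+
+# NOTE: the third-party imports above are assumed to come from the PyPI
+# packages requests, urllib3, fake-headers and maxminddb-geolite2 (the usual
+# providers of these module names); install them before running.
+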
+ips = []
+
+
+# Check whether an IP address geolocates to mainland China
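+# (A possible refinement, not in the original code: geolite2.reader() reopens
+# the bundled database on each call; a single module-level reader created once
+# could be reused instead.)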
+def getChinaIP(ip='127.0.0.1'):
+    reader = geolite2.reader()
+    ip_info = reader.get(ip)
+    geolite2.close()
+    print(ip_info)
+    # Lookups that return no country data are treated as non-Chinese
+    if not ip_info or 'country' not in ip_info:
+        return False
+    return ip_info['country']['iso_code'] == 'CN'
+
+
+# Worker thread: sends one request to the target URL through a given proxy IP
+class CrawlThread(threading.Thread):
+    def __init__(self, proxyip):
+        super(CrawlThread, self).__init__()
+        self.proxyip = proxyip
+
+    def run(self):
+        # Strip the port to get the bare IP address
+        pure_ip_address = self.proxyip.split(':')[0]
+        # Verify where the proxy IP is located
+        if not getChinaIP(pure_ip_address):
+            raise ValueError('Not a valid IP')
+        # Start timing
+        start = time.time()
+        # Suppress the warning caused by disabling certificate verification
+        urllib3.disable_warnings()
+        headers = Headers(headers=True).generate()
+        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
+        headers['Pragma'] = 'no-cache'
+        headers['Host'] = 'bb.cf08tp.cn'
+        headers['X-Forwarded-For'] = pure_ip_address
+        headers['Cookie'] = 'PHPSESSID={}'.format(
+            ''.join(str(uuid.uuid1()).split('-')))
+        print(headers)
+        html = requests.get(headers=headers, url=targetUrl, proxies={
+            "http": 'http://' + self.proxyip,
+            "https": 'https://' + self.proxyip},
+            verify=False, timeout=2).content.decode()
+        # Stop timing
+        end = time.time()
+        # Print the result
+        print(threading.current_thread().name + " via proxy " + self.proxyip +
+              " took " + str(end - start) + " seconds and fetched the following HTML:\n" +
+              html + "\n*************")
+
+
+# Thread that periodically fetches proxy IPs from the API
+class GetIpThread(threading.Thread):
+    def __init__(self, fetchSecond):
+        super(GetIpThread, self).__init__()
+        self.fetchSecond = fetchSecond
+
+    def run(self):
+        global ips
+        while True:
+            # Fetch the list of proxy IPs from the API
+            res = requests.get(apiUrl).content.decode()
+            # Split the response into individual IPs on newlines
+            ips = res.split('\n')
+            # Use each IP in turn
+            for proxyip in ips:
+                if proxyip.strip():
+                    # Run the crawl for this proxy
+                    # (use CrawlThread(proxyip).start() to run it in a separate thread)
+                    try:
+                        CrawlThread(proxyip).run()
+                        time.sleep(1.5)
+                    except Exception as e:
+                        print(e)
+            # Sleep before fetching the next batch
+            time.sleep(len(ips) / self.fetchSecond)
+
+
+if __name__ == '__main__':
+    # API endpoint of the local proxy pool
+    # apiUrl = "http://127.0.0.1:5555/all"
+    apiUrl = "http://127.0.0.1:5555/random"
+    # Target URL to request
+    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
+    fetchSecond = 5
+    # Start fetching proxy IPs automatically
+    GetIpThread(fetchSecond).start()
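+
+# Usage sketch (an assumption, not documented in the original script): run a
+# local proxy-pool service that returns one or more "ip:port" proxies, one per
+# line, at http://127.0.0.1:5555/random, then execute this file directly with
+# Python 3; each fetched proxy is geolocation-checked and then used once to
+# request targetUrl.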