|  | @@ -0,0 +1,95 @@
 | 
											
												
													
														|  | 
 |  | +# -*- coding: UTF-8 -*-
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
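'''
Fetch proxy addresses from a local HTTP API and request a target URL
through each proxy whose IP address geolocates to China.
'''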
 | 
											
												
													
														|  | 
 |  | +import requests
 | 
											
												
													
														|  | 
 |  | +import time
 | 
											
												
													
														|  | 
 |  | +import threading
 | 
											
												
													
														|  | 
 |  | +import urllib3
 | 
											
												
													
														|  | 
 |  | +from fake_headers import Headers
 | 
											
												
													
														|  | 
 |  | +import uuid
 | 
											
												
													
														|  | 
 |  | +from geolite2 import geolite2
 | 
											
												
													
														|  | 
# Module-level list of proxy entries; GetIpThread.run rebinds it to the
# newline-split response of each fetch from the proxy API.
ips = []
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Geo-IP helper followed by the crawler thread class
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def getChinaIP(ip='127.0.0.1'):
 | 
											
												
													
														|  | 
 |  | +    reader = geolite2.reader()
 | 
											
												
													
														|  | 
 |  | +    ip_info = reader.get(ip)
 | 
											
												
													
														|  | 
 |  | +    geolite2.close()
 | 
											
												
													
														|  | 
 |  | +    print(ip_info)
 | 
											
												
													
														|  | 
 |  | +    return True if ip_info['country']['iso_code'] == 'CN' else False
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +class CrawlThread(threading.Thread):
 | 
											
												
													
														|  | 
 |  | +    def __init__(self, proxyip):
 | 
											
												
													
														|  | 
 |  | +        super(CrawlThread, self).__init__()
 | 
											
												
													
														|  | 
 |  | +        self.proxyip = proxyip
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def run(self):
 | 
											
												
													
														|  | 
 |  | +        # 开始计时
 | 
											
												
													
														|  | 
 |  | +        pure_ip_address = self.proxyip.split(':')[0]
 | 
											
												
													
														|  | 
 |  | +        # 验证IP归属
 | 
											
												
													
														|  | 
 |  | +        if not getChinaIP(pure_ip_address):
 | 
											
												
													
														|  | 
 |  | +            # pass
 | 
											
												
													
														|  | 
 |  | +            raise ValueError('不是有效IP')
 | 
											
												
													
														|  | 
 |  | +        # 
 | 
											
												
													
														|  | 
 |  | +        start = time.time()
 | 
											
												
													
														|  | 
 |  | +        # 消除关闭证书验证的警告
 | 
											
												
													
														|  | 
 |  | +        urllib3.disable_warnings()
 | 
											
												
													
														|  | 
 |  | +        headers = Headers(headers=True).generate()
 | 
											
												
													
														|  | 
 |  | +        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
 | 
											
												
													
														|  | 
 |  | +        headers['Pragma'] = 'no-cache'
 | 
											
												
													
														|  | 
 |  | +        headers['Host'] = 'bb.cf08tp.cn'
 | 
											
												
													
														|  | 
 |  | +        headers['x-forward-for'] = pure_ip_address
 | 
											
												
													
														|  | 
 |  | +        headers['Cookie'] = 'PHPSESSID={}'.format(
 | 
											
												
													
														|  | 
 |  | +            ''.join(str(uuid.uuid1()).split('-')))
 | 
											
												
													
														|  | 
 |  | +        print(headers)
 | 
											
												
													
														|  | 
 |  | +        html = requests.get(headers=headers, url=targetUrl, proxies={
 | 
											
												
													
														|  | 
 |  | +                            "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
 | 
											
												
													
														|  | 
 |  | +        # 结束计时
 | 
											
												
													
														|  | 
 |  | +        end = time.time()
 | 
											
												
													
														|  | 
 |  | +        # 输出内容
 | 
											
												
													
														|  | 
 |  | +        print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) +
 | 
											
												
													
														|  | 
 |  | +              "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Thread class that fetches proxy IPs
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +class GetIpThread(threading.Thread):
 | 
											
												
													
														|  | 
 |  | +    def __init__(self, fetchSecond):
 | 
											
												
													
														|  | 
 |  | +        super(GetIpThread, self).__init__()
 | 
											
												
													
														|  | 
 |  | +        self.fetchSecond = fetchSecond
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def run(self):
 | 
											
												
													
														|  | 
 |  | +        global ips
 | 
											
												
													
														|  | 
 |  | +        while True:
 | 
											
												
													
														|  | 
 |  | +            # 获取IP列表
 | 
											
												
													
														|  | 
 |  | +            res = requests.get(apiUrl).content.decode()
 | 
											
												
													
														|  | 
 |  | +            # 按照\n分割获取到的IP
 | 
											
												
													
														|  | 
 |  | +            ips = res.split('\n')
 | 
											
												
													
														|  | 
 |  | +            # 利用每一个IP
 | 
											
												
													
														|  | 
 |  | +            for proxyip in ips:
 | 
											
												
													
														|  | 
 |  | +                if proxyip.strip():
 | 
											
												
													
														|  | 
 |  | +                    # 开启一个线程
 | 
											
												
													
														|  | 
 |  | +                    # CrawlThread(proxyip).start()
 | 
											
												
													
														|  | 
 |  | +                    try:
 | 
											
												
													
														|  | 
 |  | +                        CrawlThread(proxyip).run()
 | 
											
												
													
														|  | 
 |  | +                        time.sleep(1.5)
 | 
											
												
													
														|  | 
 |  | +                    except Exception as e:
 | 
											
												
													
														|  | 
 |  | +                        print(e)
 | 
											
												
													
														|  | 
 |  | +            # 休眠
 | 
											
												
													
														|  | 
 |  | +            time.sleep(len(ips) /self.fetchSecond )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +if __name__ == '__main__':
 | 
											
												
													
														|  | 
 |  | +    # 获取IP的API接口
 | 
											
												
													
														|  | 
 |  | +    # apiUrl = "http://127.0.0.1:5555/all"
 | 
											
												
													
														|  | 
 |  | +    apiUrl = "http://127.0.0.1:5555/random"
 | 
											
												
													
														|  | 
 |  | +    # 要抓取的目标网站地址
 | 
											
												
													
														|  | 
 |  | +    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
 | 
											
												
													
														|  | 
 |  | +    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
 | 
											
												
													
														|  | 
 |  | +    fetchSecond = 5
 | 
											
												
													
														|  | 
 |  | +    # 开始自动获取IP
 | 
											
												
													
														|  | 
 |  | +    GetIpThread(fetchSecond).start()
 |