usage2.py

# -*- coding: UTF-8 -*-
'''
Pull proxy IPs from a local pool API, verify each one geolocates to China,
and fetch the target URL through it with randomized request headers.
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
import uuid
from geolite2 import geolite2
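# Assumed PyPI packages behind these imports: requests, fake-headers
# (random header generator), and maxminddb-geolite2 (bundled GeoLite2 reader).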

ips = []


# Check whether an IP geolocates to China using the GeoLite2 database
def getChinaIP(ip='127.0.0.1'):
    reader = geolite2.reader()
    ip_info = reader.get(ip)
    geolite2.close()
    print(ip_info)
    # reader.get() returns None for addresses missing from the database
    return ip_info is not None and ip_info.get('country', {}).get('iso_code') == 'CN'
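

# Example (hypothetical address): getChinaIP('114.114.114.114') is expected
# to return True for an IP that the GeoLite2 database places in CN.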


# Thread class that crawls data through one proxy IP
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        pure_ip_address = self.proxyip.split(':')[0]
        # Verify the IP's location before using it
        if not getChinaIP(pure_ip_address):
            raise ValueError('Not a valid China IP')
        # Start timing
        start = time.time()
        # Silence the warning emitted when certificate verification is off
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        # Spoof the client address the upstream application sees
        headers['X-Forwarded-For'] = pure_ip_address
        # Fresh PHPSESSID so every request looks like a new session
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)
        # Route both schemes through the proxy; verify=False skips TLS checks
        html = requests.get(headers=headers, url=targetUrl, proxies={
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip,
        }, verify=False, timeout=2).content.decode()
        # Stop timing
        end = time.time()
        # Print the result (elapsed time is in seconds)
        print(threading.current_thread().name + " via proxy " + self.proxyip +
              " took " + str(end - start) + " s, got HTML:\n" + html + "\n*************")
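

# Example (hypothetical proxy): CrawlThread('1.2.3.4:8080').run() performs one
# synchronous fetch; calling .start() would run it on a separate thread.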


# Thread class that periodically fetches proxy IPs
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            # Fetch the IP list
            res = requests.get(apiUrl).content.decode()
            # Split the response on '\n' to get individual IPs
            ips = res.split('\n')
            # Use each IP in turn
            for proxyip in ips:
                if proxyip.strip():
                    # Crawl synchronously; use CrawlThread(proxyip).start()
                    # to run one thread per proxy instead
                    try:
                        CrawlThread(proxyip).run()
                        time.sleep(1.5)
                    except Exception as e:
                        print(e)
            # Sleep before fetching the next batch of IPs
            time.sleep(len(ips) / self.fetchSecond)
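

# The pool API is assumed to return plain text with one "ip:port" per line;
# /random presumably serves a single proxy and /all the full list.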


if __name__ == '__main__':
    # API endpoint that serves proxy IPs
    # apiUrl = "http://127.0.0.1:5555/all"
    apiUrl = "http://127.0.0.1:5555/random"
    # Target site to crawl
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    fetchSecond = 5
    # Start fetching IPs automatically
    GetIpThread(fetchSecond).start()
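
# To run (assumes a proxy pool is already listening on 127.0.0.1:5555):
#   python usage2.py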