crawler.py

import json
import re

from pyquery import PyQuery as pq

from .utils import get_page


# Metaclass that records every crawl_* method defined on the class in
# __CrawlFunc__ and stores their total count in __CrawlFuncCount__.
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
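
# Illustration (added note, not in the original source): for the Crawler class
# below, the metaclass produces roughly
#   Crawler.__CrawlFunc__      -> ['crawl_daxiang', 'crawl_daili66', ...]
#   Crawler.__CrawlFuncCount__ -> the number of crawl_* methods defined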


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Successfully got proxy', proxy)
            proxies.append(proxy)
        return proxies
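
    # Note (added, not in the original source): the eval-based dispatch above
    # is equivalent to a plain attribute lookup, e.g.
    #     for proxy in getattr(self, callback)():
    #         proxies.append(proxy)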

    def crawl_daxiang(self):
        url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=50&filter=on'
        html = get_page(url)
        if html:
            urls = html.split('\n')
            for url in urls:
                yield url

    def crawl_daili66(self, page_count=4):
        """
        Crawl proxies from daili66.
        :param page_count: number of pages to crawl
        :return: proxies
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # tr:gt(0) skips the table header row
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_proxy360(self):
        """
        Crawl proxies from Proxy360.
        :return: proxies
        """
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        """
        Crawl proxies from Goubanjia.
        :return: proxies
        """
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                # drop the <p> children mixed into the cell so only the IP text remains
                td.find('p').remove()
                yield td.text().replace(' ', '')

    def crawl_ip181(self):
        start_url = 'http://www.ip181.com/'
        html = get_page(start_url)
        if html:
            # \s* matches whitespace, including line breaks between the tags
            ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

    def crawl_ip3366(self):
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            if html:
                # \s* matches whitespace, including line breaks between the tags
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_data5u(self):
        for i in ['gngn', 'gnpt']:
            start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
            html = get_page(start_url)
            if html:
                # \s* matches whitespace, including line breaks between the tags
                ip_address = re.compile(r' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_kxdaili(self):
        for i in range(1, 4):
            start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
            html = get_page(start_url)
            if html:
                # \s* matches whitespace, including line breaks between the tags
                ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_premproxy(self):
        for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
            start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
            html = get_page(start_url)
            if html:
                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address_port in re_ip_address:
                    yield address_port.replace(' ', '')

    def crawl_xroxy(self):
        for i in ['CN', 'TW']:
            start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
            html = get_page(start_url)
            if html:
                ip_address1 = re.compile(r"title='View this Proxy details'>\s*(.*).*")
                re_ip_address1 = ip_address1.findall(html)
                ip_address2 = re.compile(r"title='Select proxies with port number .*'>(.*)</a>")
                re_ip_address2 = ip_address2.findall(html)
                for address, port in zip(re_ip_address1, re_ip_address2):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
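
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original file): how a
# caller such as the proxy pool's getter might consume this class; the names
# and surrounding project layout are assumptions.
# ---------------------------------------------------------------------------
# crawler = Crawler()
# for index in range(crawler.__CrawlFuncCount__):
#     callback = crawler.__CrawlFunc__[index]
#     for proxy in crawler.get_proxies(callback):
#         print(proxy)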