crawler.py

import json

from pyquery import PyQuery as pq

from .utils import get_page


class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        # Register every crawl_* method so Crawler can enumerate its proxy sources.
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if k.startswith('crawl_'):
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        # Look the crawl method up by name; no need for eval here.
        for proxy in getattr(self, callback)():
            print('Successfully got proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_kuaidaili(self):
        """
        Crawl the Kuaidaili API.
        :return: proxies in ip:port form
        """
        url = 'http://dev.kuaidaili.com/api/getproxy/?orderid=959961765125099&num=100&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=1&an_an=1&an_ha=1&quality=1&format=json&sep=2'
        html = get_page(url)
        if html:
            result = json.loads(html)
            ips = result.get('data', {}).get('proxy_list', [])
            for ip in ips:
                yield ip

    def crawl_daili66(self, page_count=4):
        """
        Crawl 66ip.cn.
        :param page_count: number of pages to crawl
        :return: proxies in ip:port form
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # tr:gt(0) skips the table header row.
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_proxy360(self):
        """
        Crawl Proxy360.
        :return: proxies in ip:port form
        """
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        """
        Crawl Goubanjia.
        :return: proxies in ip:port form
        """
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                # Drop the decoy <p> elements Goubanjia injects to obscure the real address.
                td.find('p').remove()
                yield td.text().replace(' ', '')
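
For context, a minimal usage sketch follows. It is not part of crawler.py; it only illustrates how a caller might drive the class: the metaclass records every crawl_* method name in __CrawlFunc__, so all proxy sources can be iterated without hard-coding method names. It assumes the module is imported as part of its package so the relative import of get_page resolves.

# Usage sketch (assumption: Crawler is importable from the package containing crawler.py)
crawler = Crawler()
all_proxies = []
for index in range(crawler.__CrawlFuncCount__):
    callback = crawler.__CrawlFunc__[index]
    # get_proxies resolves the crawl_* method by name and drains its generator.
    all_proxies.extend(crawler.get_proxies(callback))
print('Collected', len(all_proxies), 'proxies')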