getter.py

from .utils import get_page
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
    """
    Metaclass that adds two attributes to the FreeProxyGetter class:
    __CrawlFunc__ and __CrawlFuncCount__, holding the list of crawler
    functions and the number of crawler functions respectively.
    """
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        # Register every method whose name contains 'crawl_' as a crawler
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        proxies = []
        print('Callback', callback)
        # callback is the name of a crawl_* generator; call it and collect its results
        for proxy in eval("self.{}()".format(callback)):
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # Skip the header row of the proxy table
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_proxy360(self):
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                # Drop the decoy <p> elements the site inserts to obfuscate addresses
                td.find('p').remove()
                yield td.text().replace(' ', '')

    def crawl_haoip(self):
        start_url = 'http://haoip.cc/tiqu.htm'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            results = doc('.row .col-xs-12').html().split('<br/>')
            for result in results:
                if result:
                    yield result.strip()
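
The point of the metaclass is that a scheduler can discover every crawler without a hand-maintained list: it just walks __CrawlFunc__ and feeds each name to get_raw_proxies. The driver below is a minimal sketch of that usage, not part of getter.py; the Getter class, its run method, and the proxypool.getter import path are illustrative assumptions (getter.py's relative import of .utils means it must live inside some package).

# Hypothetical driver; only FreeProxyGetter, __CrawlFunc__ and
# get_raw_proxies come from getter.py above. The import path and the
# Getter/run names are assumptions for illustration.
from proxypool.getter import FreeProxyGetter  # assumed package layout


class Getter(object):
    def __init__(self):
        self.crawler = FreeProxyGetter()

    def run(self):
        all_proxies = []
        # __CrawlFunc__ was filled in by ProxyMetaclass at class-creation
        # time, so every crawl_* method is picked up automatically.
        for callback in self.crawler.__CrawlFunc__:
            all_proxies.extend(self.crawler.get_raw_proxies(callback))
        return all_proxies


if __name__ == '__main__':
    proxies = Getter().run()
    print('Fetched', len(proxies), 'raw proxies')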