getter.py

from .utils import get_page
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
    """Metaclass that collects every crawl_* method into __CrawlFunc__."""

    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        """Run the crawler named by callback and collect its proxies into a list."""
        proxies = []
        print('Callback', callback)
        for proxy in eval("self.{}()".format(callback)):
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """Crawl www.66ip.cn and yield proxies as ip:port strings."""
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # Skip the header row, then read ip and port from the first two cells.
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_proxy360(self):
        """Crawl www.proxy360.cn and yield proxies as ip:port strings."""
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        """Crawl www.goubanjia.com and yield proxies as ip:port strings."""
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                # Drop the decoy <p> elements the site injects, then strip spaces.
                td.find('p').remove()
                yield td.text().replace(' ', '')

    def crawl_haoip(self):
        """Crawl haoip.cc and yield proxies as ip:port strings."""
        start_url = 'http://haoip.cc/tiqu.htm'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            # Proxies are plain text separated by <br/> tags inside one column.
            results = doc('.row .col-xs-12').html().split('<br/>')
            for result in results:
                if result:
                    yield result.strip()
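
For reference, a minimal usage sketch: ProxyMetaclass records every crawl_* method name in __CrawlFunc__ (and their count in __CrawlFuncCount__), so a caller can iterate over the collected names and pass each one to get_raw_proxies. The import path and driver script below are assumptions for illustration; only get_page from the sibling utils module is taken from the file above.

# Hypothetical driver script, not part of getter.py.
from proxypool.getter import FreeProxyGetter  # assumed package layout

getter = FreeProxyGetter()
all_proxies = []
# __CrawlFunc__ holds the crawler method names gathered by the metaclass.
for callback in getter.__CrawlFunc__:
    all_proxies.extend(getter.get_raw_proxies(callback))
print(getter.__CrawlFuncCount__, 'crawlers ran,', len(all_proxies), 'proxies collected')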