# daili66.py — crawler for the 66ip.cn free-proxy listing pages
  1. from pyquery import PyQuery as pq
  2. from proxypool.proxy import Proxy
  3. from proxypool.crawlers.base import BaseCrawler
  4. BASE_URL = 'http://www.636ip.cn/{page}.html'
  5. MAX_PAGE = 5
  6. class Daili66Crawler(BaseCrawler):
  7. """
  8. daili66 crawler, http://www.66ip.cn/1.html
  9. """
  10. urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
  11. def parse(self, html):
  12. """
  13. parse html file to get proxies
  14. :return:
  15. """
  16. doc = pq(html)
  17. trs = doc('.containerbox table tr:gt(0)').items()
  18. for tr in trs:
  19. host = tr.find('td:nth-child(1)').text()
  20. port = int(tr.find('td:nth-child(2)').text())
  21. yield Proxy(host=host, port=port)
  22. if __name__ == '__main__':
  23. crawler = Daili66Crawler()
  24. for proxy in crawler.crawl():
  25. print(proxy)