# xiaoshudaili.py

import re

from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://www.xsdaili.cn/"
PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
MAX_PAGE = 3


class XiaoShuCrawler(BaseCrawler):
    """
    Xiaoshu proxy (小舒代理) crawler, http://www.xsdaili.cn/
    """

    def __init__(self):
        """
        Find the newest daily-proxy article on the homepage and build
        the list of page URLs to crawl.
        """
        try:
            html = self.fetch(url=BASE_URL)
        except Exception:
            # homepage unreachable: nothing to crawl
            self.urls = []
            return
        doc = pq(html)
        titles = doc(".title:eq(0) a").items()

        # each article link looks like /dayProxy/ip/<number>.html;
        # keep the number from the last matching link as the latest page id
        latest_page = 0
        for t in titles:
            res = re.search(r"/(\d+)\.html", t.attr("href"))
            latest_page = int(res.group(1)) if res else 0
        if latest_page:
            # crawl the MAX_PAGE pages preceding the latest page id
            self.urls = [
                PAGE_BASE_URL.format(page=page)
                for page in range(latest_page - MAX_PAGE, latest_page)
            ]
        else:
            self.urls = []
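    # Worked example (illustrative numbers, not taken from the source): if
    # the newest homepage article is /dayProxy/ip/2358.html, then
    # latest_page == 2358 and, with MAX_PAGE == 3, self.urls covers pages
    # 2355, 2356 and 2357, i.e. range(latest_page - MAX_PAGE, latest_page).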
    def parse(self, html):
        """
        Parse an article page and yield the proxies listed in it.
        :param html: html of a dayProxy article page
        :return: generator of Proxy objects
        """
        doc = pq(html)
        contents = doc('.cont').text()
        contents = contents.split("\n")
        for content in contents:
            # each line is "host:port@..."; drop everything from "@" onwards
            c = content[:content.find("@")]
            if ":" not in c:
                # skip blank or malformed lines
                continue
            host, port = c.split(":")
            yield Proxy(host=host, port=int(port))

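# Example of the parsing step (the line format below is an assumption about
# how xsdaili.cn renders its .cont block, not taken from this file):
#
#     "113.195.224.37:9999@HTTP#..."
#
# parse() cuts the line at "@", then splits "113.195.224.37:9999" on ":" and
# yields Proxy(host="113.195.224.37", port=9999).
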
if __name__ == '__main__':
    crawler = XiaoShuCrawler()
    for proxy in crawler.crawl():
        print(proxy)
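
# Note: BaseCrawler is assumed (as in the proxypool project this module comes
# from) to provide fetch(url), an HTTP GET returning the page html, and
# crawl(), which fetches every url in self.urls and yields whatever parse()
# produces for each page. A minimal sketch under that assumption:
#
#     def crawl(self):
#         for url in self.urls:
#             html = self.fetch(url)
#             yield from self.parse(html)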