ip3366.py 870 B

12345678910111213141516171819202122232425262728293031
  1. from proxypool.crawlers.base import BaseCrawler
  2. from proxypool.schemas.proxy import Proxy
  3. import re
  4. MAX_PAGE = 5
  5. BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
  6. class IP3366Crawler(BaseCrawler):
  7. """
  8. ip3366 crawler, http://www.ip3366.net/
  9. """
  10. urls = [BASE_URL.format(page=i) for i in range(1, 8)]
  11. def parse(self, html):
  12. """
  13. parse html file to get proxies
  14. :return:
  15. """
  16. ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
  17. # \s * 匹配空格,起到换行作用
  18. re_ip_address = ip_address.findall(html)
  19. for address, port in re_ip_address:
  20. proxy = Proxy(host=address.strip(), port=int(port.strip()))
  21. yield proxy
  22. if __name__ == '__main__':
  23. crawler = IP3366Crawler()
  24. for proxy in crawler.crawl():
  25. print(proxy)