iphai.py 1014 B

12345678910111213141516171819202122232425262728293031323334
  1. from proxypool.crawlers.base import BaseCrawler
  2. from proxypool.schemas.proxy import Proxy
  3. import re
  4. BASE_URL = 'http://www.iphai.com/'
  5. class IPHaiCrawler(BaseCrawler):
  6. """
  7. iphai crawler, http://www.iphai.com/
  8. """
  9. urls = [BASE_URL]
  10. def parse(self, html):
  11. """
  12. parse html file to get proxies
  13. :return:
  14. """
  15. find_tr = re.compile('<tr>(.*?)</tr>', re.S)
  16. trs = find_tr.findall(html)
  17. for s in range(1, len(trs)):
  18. find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
  19. re_ip_address = find_ip.findall(trs[s])
  20. find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
  21. re_port = find_port.findall(trs[s])
  22. for address, port in zip(re_ip_address, re_port):
  23. proxy = Proxy(host=address.strip(), port=int(port.strip()))
  24. yield proxy
  25. if __name__ == '__main__':
  26. crawler = IPHaiCrawler()
  27. for proxy in crawler.crawl():
  28. print(proxy)