# base.py
  1. from retrying import retry
  2. import requests
  3. from loguru import logger
  4. class BaseCrawler(object):
  5. urls = []
  6. @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None)
  7. def fetch(self, url, **kwargs):
  8. try:
  9. response = requests.get(url, **kwargs)
  10. if response.status_code == 200:
  11. return response.text
  12. except requests.ConnectionError:
  13. return
  14. @logger.catch
  15. def crawl(self):
  16. """
  17. crawl main method
  18. """
  19. for url in self.urls:
  20. logger.info(f'fetching {url}')
  21. html = self.fetch(url)
  22. for proxy in self.parse(html):
  23. logger.info(f'fetched proxy {proxy.string()}')
  24. yield proxy