base.py

from retrying import retry
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT
from fake_headers import Headers
import time


class BaseCrawler(object):
    # subclasses override this with the list of pages to scrape
    urls = []

    # retry up to 3 times, waiting 2 s between attempts, whenever fetch returns None
    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
    def fetch(self, url, **kwargs):
        try:
            # randomized browser-like headers to reduce the chance of being blocked
            headers = Headers(headers=True).generate()
            kwargs.setdefault('timeout', GET_TIMEOUT)
            kwargs.setdefault('verify', False)
            kwargs.setdefault('headers', headers)
            response = requests.get(url, **kwargs)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
        except requests.ConnectionError:
            return

    @logger.catch
    def crawl(self):
        """
        crawl main method: fetch every page in urls and yield the proxies parsed from it
        """
        for url in self.urls:
            logger.info(f'fetching {url}')
            html = self.fetch(url)
            time.sleep(.5)
            for proxy in self.parse(html):
                logger.info(f'fetched proxy {proxy.string()} from {url}')
                yield proxy