# xiaoshudaili.py

import re

from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://www.xsdaili.cn/"
PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
MAX_PAGE = 3


class XiaoShuCrawler(BaseCrawler):
    """
    Xiaoshu proxy (小舒代理) crawler, http://www.xsdaili.cn/
    """

    def __init__(self):
        """
        Find the newest daily-proxy article on the homepage and build
        the list of page URLs to crawl.
        """
        try:
            html = self.fetch(url=BASE_URL)
        except Exception:
            # homepage unreachable: nothing to crawl
            self.urls = []
            return
        doc = pq(html)
        titles = doc(".title:eq(0) a").items()

        # each article link looks like /dayProxy/ip/<number>.html;
        # keep the number from the last matching link as the latest page id
        latest_page = 0
        for t in titles:
            res = re.search(r"/(\d+)\.html", t.attr("href"))
            latest_page = int(res.group(1)) if res else 0
        if latest_page:
            # crawl the MAX_PAGE pages preceding the latest page id
            self.urls = [
                PAGE_BASE_URL.format(page=page)
                for page in range(latest_page - MAX_PAGE, latest_page)
            ]
        else:
            self.urls = []
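    # Worked example (illustrative numbers, not taken from the source): if
    # the newest homepage article is /dayProxy/ip/2358.html, then
    # latest_page == 2358 and, with MAX_PAGE == 3, self.urls covers pages
    # 2355, 2356 and 2357, i.e. range(latest_page - MAX_PAGE, latest_page).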
    def parse(self, html):
        """
        Parse an article page and yield the proxies listed in it.
        :param html: html of a dayProxy article page
        :return: generator of Proxy objects
        """
        doc = pq(html)
        contents = doc('.cont').text()
        contents = contents.split("\n")
        for content in contents:
            # each line is "host:port@..."; drop everything from "@" onwards
            c = content[:content.find("@")]
            if ":" not in c:
                # skip blank or malformed lines
                continue
            host, port = c.split(":")
            yield Proxy(host=host, port=int(port))

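# Example of the parsing step (the line format below is an assumption about
# how xsdaili.cn renders its .cont block, not taken from this file):
#
#     "113.195.224.37:9999@HTTP#..."
#
# parse() cuts the line at "@", then splits "113.195.224.37:9999" on ":" and
# yields Proxy(host="113.195.224.37", port=9999).
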
if __name__ == '__main__':
    crawler = XiaoShuCrawler()
    for proxy in crawler.crawl():
        print(proxy)
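
# Note: BaseCrawler is assumed (as in the proxypool project this module comes
# from) to provide fetch(url), an HTTP GET returning the page html, and
# crawl(), which fetches every url in self.urls and yields whatever parse()
# produces for each page. A minimal sketch under that assumption:
#
#     def crawl(self):
#         for url in self.urls:
#             html = self.fetch(url)
#             yield from self.parse(html)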