# Apq / germey_ProxyPool
# mirror of https://github.com/Python3WebSpider/ProxyPool.git

							from pyquery import PyQuery as pq
from proxypool.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


# URL template for the listing pages; '{page}' is replaced by the page number.
# NOTE(review): the host here is 636ip.cn, while the Daili66Crawler docstring
# cites www.66ip.cn -- confirm which host is intended.
BASE_URL = 'http://www.636ip.cn/{page}.html'
# Last page number to crawl; pages 1..MAX_PAGE (inclusive) are turned into URLs.
MAX_PAGE = 5

class Daili66Crawler(BaseCrawler):
    """
    daili66 crawler, http://www.66ip.cn/1.html

    ``urls`` holds one listing-page URL per page number from 1 to MAX_PAGE,
    built from BASE_URL. ``parse`` turns the HTML of one such page into
    Proxy objects. Fetching the pages is presumably done by BaseCrawler
    (not visible in this file).
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        parse html file to get proxies

        Rows whose host cell is empty or whose port cell is not an integer
        are skipped, so one malformed row does not abort the whole page.

        :param html: HTML text of one listing page
        :return: generator yielding one Proxy per well-formed table row
        """
        doc = pq(html)
        # ':gt(0)' selects rows with index > 0, i.e. everything after the
        # first row of the table (presumably the column-header row).
        for tr in doc('.containerbox table tr:gt(0)').items():
            host = tr.find('td:nth-child(1)').text()
            port_text = tr.find('td:nth-child(2)').text()
            if not host:
                # Nothing usable in the first cell; not a proxy row.
                continue
            try:
                port = int(port_text)
            except ValueError:
                # Empty or non-numeric port cell (spacer/ad/header-like row).
                continue
            yield Proxy(host=host, port=port)

if __name__ == '__main__':
    # Manual run: crawl with Daili66Crawler and print each proxy as it is yielded.
    for found in Daili66Crawler().crawl():
        print(found)