| 12345678910111213141516171819202122232425262728293031 |
- from pyquery import PyQuery as pq
- from proxypool.proxy import Proxy
- from proxypool.crawlers.base import BaseCrawler
- BASE_URL = 'http://www.636ip.cn/{page}.html'
- MAX_PAGE = 5
- class Daili66Crawler(BaseCrawler):
- """
- daili66 crawler, http://www.66ip.cn/1.html
- """
- urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
-
-
- def parse(self, html):
- """
- parse html file to get proxies
- :return:
- """
- doc = pq(html)
- trs = doc('.containerbox table tr:gt(0)').items()
- for tr in trs:
- host = tr.find('td:nth-child(1)').text()
- port = int(tr.find('td:nth-child(2)').text())
- yield Proxy(host=host, port=port)
- if __name__ == '__main__':
- crawler = Daili66Crawler()
- for proxy in crawler.crawl():
- print(proxy)
|