|
@@ -6,13 +6,13 @@ from proxypool.crawlers.base import BaseCrawler
|
|
|
# Target listing for the daili66 free-proxy site and how many pages to crawl.
# NOTE(review): the constant previously read 'www.664ip.cn', which contradicts
# the class docstring and the crawler's name ('66ip.cn'); corrected the host so
# the generated page URLs actually hit the documented site.
BASE_URL = 'http://www.66ip.cn/{page}.html'
MAX_PAGE = 5
|
|
|
|
|
|
|
|
|
|
+
|
|
|
class Daili66Crawler(BaseCrawler):
|
|
class Daili66Crawler(BaseCrawler):
|
|
|
"""
|
|
"""
|
|
|
daili66 crawler, http://www.66ip.cn/1.html
|
|
daili66 crawler, http://www.66ip.cn/1.html
|
|
|
"""
|
|
"""
|
|
|
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
|
|
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
|
|
|
|
|
|
|
|
-
|
|
|
|
|
def parse(self, html):
|
|
def parse(self, html):
|
|
|
"""
|
|
"""
|
|
|
parse html file to get proxies
|
|
parse html file to get proxies
|
|
@@ -25,6 +25,7 @@ class Daili66Crawler(BaseCrawler):
|
|
|
port = int(tr.find('td:nth-child(2)').text())
|
|
port = int(tr.find('td:nth-child(2)').text())
|
|
|
yield Proxy(host=host, port=port)
|
|
yield Proxy(host=host, port=port)
|
|
|
|
|
|
|
|
|
|
+
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
crawler = Daili66Crawler()
|
|
crawler = Daili66Crawler()
|
|
|
for proxy in crawler.crawl():
|
|
for proxy in crawler.crawl():
|