|
|
@@ -31,6 +31,8 @@ from selenium.webdriver import ActionChains
|
|
|
from selenium.webdriver.common.by import By
|
|
|
import undetected_chromedriver as uc
|
|
|
import random
|
|
|
+# import pandas as pd
|
|
|
+from openpyxl import load_workbook, Workbook
|
|
|
# import numpy
|
|
|
import csv
|
|
|
import os
|
|
|
@@ -40,15 +42,16 @@ from PIL import Image
|
|
|
# import uuid
|
|
|
from threading import Thread, Event
|
|
|
from myChrome import MyChrome
|
|
|
-from utils import check_pause, download_image, get_output_code, isnull
|
|
|
+from utils import check_pause, download_image, get_output_code, isnull, write_to_csv, write_to_excel
|
|
|
desired_capabilities = DesiredCapabilities.CHROME
|
|
|
desired_capabilities["pageLoadStrategy"] = "none"
|
|
|
|
|
|
|
|
|
class BrowserThread(Thread):
|
|
|
- def __init__(self, browser_t, id, service, version, event):
|
|
|
+ def __init__(self, browser_t, id, service, version, event, config):
|
|
|
Thread.__init__(self)
|
|
|
self.browser = browser_t
|
|
|
+ self.config = config
|
|
|
self.id = id
|
|
|
self.event = event
|
|
|
self.saveName = saveName
|
|
|
@@ -65,6 +68,14 @@ class BrowserThread(Thread):
|
|
|
WebDriverWait(self.browser, 10)
|
|
|
self.browser.get('about:blank')
|
|
|
self.procedure = service["graph"] # 程序执行流程
|
|
|
+ try:
|
|
|
+ self.maxViewLength = service["maxViewLength"] # 最大显示长度
|
|
|
+ except:
|
|
|
+ self.maxViewLength = 15
|
|
|
+ try:
|
|
|
+ self.outputFormat = service["outputFormat"] # 输出格式
|
|
|
+ except:
|
|
|
+ self.outputFormat = "csv"
|
|
|
try:
|
|
|
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
|
|
pass
|
|
|
@@ -88,6 +99,7 @@ class BrowserThread(Thread):
|
|
|
self.links = list(
|
|
|
filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
|
|
|
self.OUTPUT = [] # 采集的数据
|
|
|
+ self.OUTPUT.append([]) # 添加表头
|
|
|
self.containJudge = service["containJudge"] # 是否含有判断语句
|
|
|
tOut = service["outputParameters"] # 生成输出参数对象
|
|
|
self.outputParameters = {}
|
|
|
@@ -95,15 +107,19 @@ class BrowserThread(Thread):
|
|
|
self.log = "" # 记下现在总共开了多少个标签页
|
|
|
self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
|
|
|
self.SAVED = False # 记录是否已经存储了
|
|
|
- # 文件叠加的时候不添加表头
|
|
|
- if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
|
|
- self.OUTPUT.append([]) # 添加表头
|
|
|
for para in tOut:
|
|
|
if para["name"] not in self.outputParameters.keys():
|
|
|
self.outputParameters[para["name"]] = ""
|
|
|
self.dataNotFoundKeys[para["name"]] = False
|
|
|
- if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
|
|
- self.OUTPUT[0].append(para["name"])
|
|
|
+ # 文件叠加的时候不添加表头
|
|
|
+ if self.outputFormat == "csv":
|
|
|
+ if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
|
|
+ self.OUTPUT[0].append(para["name"])
|
|
|
+ elif self.outputFormat == "xlsx":
|
|
|
+ if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.xlsx'):
|
|
|
+ self.OUTPUT[0].append(para["name"])
|
|
|
+ elif self.outputFormat == "mysql": # MySQL不需要表头
|
|
|
+ pass
|
|
|
self.urlId = 0 # 全局记录变量
|
|
|
self.preprocess() # 预处理,优化提取数据流程
|
|
|
|
|
|
@@ -134,6 +150,8 @@ class BrowserThread(Thread):
|
|
|
def run(self):
|
|
|
# 挨个执行程序
|
|
|
for i in range(len(self.links)):
|
|
|
+ print("正在执行第", i + 1, "/ ", len(self.links), "个链接")
|
|
|
+ print("Executing link", i + 1, "/ ", len(self.links))
|
|
|
self.executeNode(0)
|
|
|
self.urlId = self.urlId + 1
|
|
|
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
|
|
@@ -167,11 +185,17 @@ class BrowserThread(Thread):
|
|
|
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
|
|
|
file_obj.write(self.log)
|
|
|
file_obj.close()
|
|
|
- with open("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv', 'a', encoding='utf-8-sig', newline="") as f:
|
|
|
- f_csv = csv.writer(f)
|
|
|
- for line in self.OUTPUT:
|
|
|
- f_csv.writerow(line)
|
|
|
- f.close()
|
|
|
+ if self.outputFormat == "csv":
|
|
|
+ file_name = "Data/Task_" + \
|
|
|
+ str(self.id) + "/" + self.saveName + '.csv'
|
|
|
+ write_to_csv(file_name, self.OUTPUT)
|
|
|
+ elif self.outputFormat == "xlsx":
|
|
|
+ file_name = "Data/Task_" + \
|
|
|
+ str(self.id) + "/" + self.saveName + '.xlsx'
|
|
|
+ write_to_excel(file_name, self.OUTPUT)
|
|
|
+ elif self.outputFormat == "mysql":
|
|
|
+ # write_to_mysql(self.config, )
|
|
|
+ pass
|
|
|
self.OUTPUT = []
|
|
|
self.log = ""
|
|
|
|
|
|
@@ -302,7 +326,7 @@ class BrowserThread(Thread):
|
|
|
line = []
|
|
|
for value in self.outputParameters.values():
|
|
|
line.append(value)
|
|
|
- print(value[:15], " ", end="")
|
|
|
+ print(value[:self.maxViewLength], " ", end="")
|
|
|
print("")
|
|
|
self.OUTPUT.append(line)
|
|
|
|
|
|
@@ -728,6 +752,9 @@ class BrowserThread(Thread):
|
|
|
self.browser.execute_script('window.stop()')
|
|
|
except:
|
|
|
pass
|
|
|
+ except Exception as e:
|
|
|
+ print("Failed to load page: " + url)
|
|
|
+ self.recordLog('Failed to load page: ' + url)
|
|
|
try:
|
|
|
self.history["index"] = self.browser.execute_script(
|
|
|
"return history.length")
|
|
|
@@ -1184,7 +1211,7 @@ class BrowserThread(Thread):
|
|
|
line = []
|
|
|
for value in self.outputParameters.values():
|
|
|
line.append(value)
|
|
|
- print(value[:15], " ", end="")
|
|
|
+ print(value[:self.maxViewLength], " ", end="")
|
|
|
print("")
|
|
|
self.OUTPUT.append(line)
|
|
|
# rt.end()
|
|
|
@@ -1279,12 +1306,15 @@ if __name__ == '__main__':
|
|
|
# 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
|
|
|
# 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
|
|
|
# 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
|
|
|
- if c.user_data:
|
|
|
+ try:
|
|
|
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
|
|
config = json.load(f)
|
|
|
absolute_user_data_folder = config["absolute_user_data_folder"]
|
|
|
print("\nAbsolute_user_data_folder:",
|
|
|
absolute_user_data_folder, "\n")
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+ if c.user_data:
|
|
|
option.add_argument(
|
|
|
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
|
|
option.add_argument("--profile-directory=Default")
|
|
|
@@ -1371,7 +1401,8 @@ if __name__ == '__main__':
|
|
|
print("过Cloudflare验证模式")
|
|
|
event = Event()
|
|
|
event.set()
|
|
|
- thread = BrowserThread(browser_t, i, service, c.version, event)
|
|
|
+ thread = BrowserThread(browser_t, i, service,
|
|
|
+ c.version, event, config=config)
|
|
|
print("Thread with task id: ", i, " is created")
|
|
|
threads.append(thread)
|
|
|
thread.start()
|