easyspider_executestage.py 136 KB


  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. import atexit
  4. import copy
  5. import platform
  6. import shutil
  7. import string
  8. import threading
  9. # import undetected_chromedriver as uc
  10. from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
  11. on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
  12. from myChrome import MyChrome
  13. from threading import Thread, Event
  14. from PIL import Image
  15. from commandline_config import Config
  16. import os
  17. import csv
  18. from openpyxl import load_workbook, Workbook
  19. import random
  20. from selenium.webdriver import ActionChains
  21. from selenium.webdriver.support.ui import Select
  22. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  23. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  24. from selenium.common.exceptions import TimeoutException
  25. from selenium.common.exceptions import NoSuchElementException
  26. from selenium.webdriver.common.by import By
  27. from selenium.webdriver.support import expected_conditions as EC
  28. from selenium.webdriver.support.ui import WebDriverWait
  29. from selenium import webdriver
  30. from selenium.webdriver.common.action_chains import ActionChains
  31. from selenium.webdriver.common.keys import Keys
  32. from selenium.webdriver.chrome.options import Options
  33. from selenium.webdriver.chrome.service import Service
  34. from datetime import datetime
  35. import io # 遇到错误退出时应执行的代码
  36. import json
  37. # from lib2to3.pgen2 import driver
  38. import re
  39. # import shutil
  40. import subprocess
  41. import sys
  42. # from urllib import parse
  43. # import base64
  44. # import hashlib
  45. import time
  46. import requests
  47. from multiprocessing import freeze_support
  48. freeze_support() # 防止无限死循环多开
  49. try:
  50. from ddddocr import DdddOcr
  51. import onnxruntime
  52. onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
  53. except:
  54. print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
  55. print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
  56. from urllib.parse import urljoin
  57. from lxml import etree, html
  58. try:
  59. import pandas as pd
  60. except:
  61. print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
  62. print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
  63. time.sleep(1)
  64. # import numpy
  65. # import pytesseract
  66. # import uuid
  67. if sys.platform != "darwin":
  68. from myChrome import MyUCChrome
  69. desired_capabilities = DesiredCapabilities.CHROME
  70. desired_capabilities["pageLoadStrategy"] = "none"
  71. class BrowserThread(Thread):
  72. def __init__(self, browser_t, id, service, version, event, saveName, config, option):
  73. Thread.__init__(self)
  74. self.logs = io.StringIO()
  75. try:
  76. self.log = bool(service["recordLog"])
  77. except:
  78. self.log = True
  79. self.browser = browser_t
  80. self.option = option
  81. self.config = config
  82. self.version = version
  83. self.totalSteps = 0
  84. self.id = id
  85. self.event = event
  86. try:
  87. self.saveName = service["saveName"] # 保存文件的名字
  88. except:
  89. now = datetime.now()
  90. # 将时间格式化为精确到秒的字符串
  91. self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
  92. self.OUTPUT = ""
  93. self.SAVED = False
  94. self.BREAK = False
  95. self.CONTINUE = False
  96. try:
  97. maximizeWindow = service["maximizeWindow"]
  98. except:
  99. maximizeWindow = 0
  100. if maximizeWindow == 1:
  101. self.browser.maximize_window()
  102. # 名称设定
  103. if saveName != "": # 命令行覆盖保存名称
  104. self.saveName = saveName # 保存文件的名字
  105. now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
  106. self.saveName = self.saveName.replace("current_time", now)
  107. self.print_and_log("任务ID", id, "的保存文件名为:", self.saveName)
  108. self.print_and_log("Save Name for task ID", id, "is:", self.saveName)
  109. if not os.path.exists("Data/Task_" + str(id)):
  110. os.mkdir("Data/Task_" + str(id))
  111. self.downloadFolder = "Data/Task_" + str(id) + "/" + self.saveName
  112. if not os.path.exists(self.downloadFolder):
  113. os.mkdir(self.downloadFolder) # 创建保存文件夹用来保存截图和文件
  114. if not os.path.exists(self.downloadFolder + "/files"):
  115. os.mkdir(self.downloadFolder + "/files")
  116. if not os.path.exists(self.downloadFolder + "/images"):
  117. os.mkdir(self.downloadFolder + "/images")
  118. self.getDataStep = 0
  119. self.startSteps = 0
  120. try:
  121. startFromExit = service["startFromExit"] # 从上次退出的步骤开始
  122. if startFromExit == 1:
  123. with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
  124. encoding='utf-8-sig') as file_obj:
  125. self.startSteps = int(file_obj.read()) # 读取已执行步数
  126. except:
  127. pass
  128. if self.startSteps != 0:
  129. self.print_and_log("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
  130. self.startSteps, "条。")
  131. self.print_and_log("In this mode, task ID", self.id,
  132. "will start from the last step, before we already collected", self.startSteps, " items.")
  133. else:
  134. self.print_and_log("此模式下,任务ID", self.id,
  135. "将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
  136. self.print_and_log("In this mode, task ID", self.id,
  137. "will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
  138. stealth_path = driver_path[:driver_path.find(
  139. "chromedriver")] + "stealth.min.js"
  140. with open(stealth_path, 'r') as f:
  141. js = f.read()
  142. self.print_and_log("Loading stealth.min.js")
  143. self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
  144. 'source': js}) # TMALL 反扒
  145. self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  146. "source": """
  147. Object.defineProperty(navigator, 'webdriver', {
  148. get: () => undefined
  149. })
  150. """
  151. })
  152. WebDriverWait(self.browser, 10)
  153. self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
  154. path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id), self.saveName, "files")
  155. self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
  156. self.browser.execute("send_command", self.paramss) # 下载目录改变
  157. self.monitor_event = threading.Event()
  158. self.monitor_thread = threading.Thread(target=rename_downloaded_file, args=(path, self.monitor_event)) #path后面的逗号不能省略,是元组固定写法
  159. self.monitor_thread.start()
  160. # self.browser.get('about:blank')
  161. self.procedure = service["graph"] # 程序执行流程
  162. try:
  163. self.maxViewLength = service["maxViewLength"] # 最大显示长度
  164. except:
  165. self.maxViewLength = 15
  166. try:
  167. self.outputFormat = service["outputFormat"] # 输出格式
  168. except:
  169. self.outputFormat = "csv"
  170. try:
  171. self.task_version = service["version"] # 任务版本
  172. if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
  173. pass
  174. else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
  175. if service["version"] != version:
  176. self.print_and_log("版本不一致,请使用" +
  177. service["version"] + "版本的EasySpider运行该任务!")
  178. self.print_and_log("Version not match, please use EasySpider " +
  179. service["version"] + " to run this task!")
  180. self.browser.quit()
  181. sys.exit()
  182. except: # 0.2.0版本没有version字段,所以直接退出
  183. self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
  184. self.print_and_log(
  185. "Version not match, please use EasySpider v0.2.0 to run this task!")
  186. self.browser.quit()
  187. sys.exit()
  188. try:
  189. self.save_threshold = service["saveThreshold"] # 保存最低阈值
  190. except:
  191. self.save_threshold = 10
  192. try:
  193. self.links = list(
  194. filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
  195. except:
  196. self.links = list(filter(isnotnull, service["url"])) # 要执行的link
  197. self.OUTPUT = [] # 采集的数据
  198. try:
  199. self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖,3为重命名文件
  200. except:
  201. self.dataWriteMode = 1
  202. if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
  203. if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
  204. if self.dataWriteMode == 2:
  205. os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
  206. elif self.dataWriteMode == 3:
  207. i = 2
  208. while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
  209. i = i + 1
  210. self.saveName = self.saveName + '_' + str(i)
  211. self.print_and_log("文件已存在,已重命名为", self.saveName)
  212. self.writeMode = 1 # 写入模式,0为新建,1为追加
  213. if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
  214. if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
  215. self.OUTPUT.append([]) # 添加表头
  216. self.writeMode = 0
  217. elif self.outputFormat == "json":
  218. self.writeMode = 3 # JSON模式无需判断是否存在文件
  219. elif self.outputFormat == "mysql":
  220. self.mysql = myMySQL(config["mysql_config_path"])
  221. self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
  222. self.writeMode = 2
  223. if self.writeMode == 0:
  224. self.print_and_log("新建模式|Create Mode")
  225. elif self.writeMode == 1:
  226. self.print_and_log("追加模式|Append Mode")
  227. elif self.writeMode == 2:
  228. self.print_and_log("MySQL模式|MySQL Mode")
  229. elif self.writeMode == 3:
  230. self.print_and_log("JSON模式|JSON Mode")
  231. self.containJudge = service["containJudge"] # 是否含有判断语句
  232. self.outputParameters = {}
  233. self.service = service
  234. self.outputParametersTypes = []
  235. self.outputParametersRecord = [] # 字段是否被记录
  236. self.dataNotFoundKeys = {} # 记录没有找到数据的key
  237. self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
  238. self.SAVED = False # 记录是否已经存储了
  239. for param in service["outputParameters"]: # 初始化输出参数
  240. if param["name"] not in self.outputParameters.keys():
  241. self.outputParameters[param["name"]] = ""
  242. self.dataNotFoundKeys[param["name"]] = False
  243. try:
  244. self.outputParametersTypes.append(param["type"])
  245. except:
  246. self.outputParametersTypes.append("text")
  247. try:
  248. self.outputParametersRecord.append(
  249. bool(param["recordASField"]))
  250. except:
  251. self.outputParametersRecord.append(True)
  252. # 文件叠加的时候不添加表头
  253. if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
  254. if self.writeMode == 0:
  255. self.OUTPUT[0].append(param["name"])
  256. self.urlId = 0 # 全局记录变量
  257. self.preprocess() # 预处理,优化提取数据流程
  258. try:
  259. self.inputExcel = service["inputExcel"] # 输入Excel
  260. except:
  261. self.inputExcel = ""
  262. self.readFromExcel() # 读取Excel获得参数值
  263. # 检测如果没有复杂的操作,优化提取数据流程
  264. def preprocess(self):
  265. for node in self.procedure:
  266. try:
  267. iframe = node["parameters"]["iframe"]
  268. except:
  269. node["parameters"]["iframe"] = False
  270. try:
  271. node["parameters"]["xpath"] = lowercase_tags_in_xpath(
  272. node["parameters"]["xpath"])
  273. except:
  274. pass
  275. try:
  276. node["parameters"]["waitElementIframeIndex"] = int(
  277. node["parameters"]["waitElementIframeIndex"])
  278. except:
  279. node["parameters"]["waitElement"] = ""
  280. node["parameters"]["waitElementTime"] = 10
  281. node["parameters"]["waitElementIframeIndex"] = 0
  282. if node["option"] == 1: # 打开网页操作
  283. try:
  284. cookies = node["parameters"]["cookies"]
  285. except:
  286. node["parameters"]["cookies"] = ""
  287. elif node["option"] == 2: # 点击操作
  288. try:
  289. alertHandleType = node["parameters"]["alertHandleType"]
  290. except:
  291. node["parameters"]["alertHandleType"] = 0
  292. if node["parameters"]["useLoop"]:
  293. if self.task_version <= "0.3.5":
  294. # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
  295. node["parameters"]["xpath"] = ""
  296. self.print_and_log("您的任务版本号为" + self.task_version +
  297. ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
  298. elif node["option"] == 3: # 提取数据操作
  299. node["parameters"]["recordASField"] = 0
  300. try:
  301. params = node["parameters"]["params"]
  302. except:
  303. node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
  304. params = node["parameters"]["params"]
  305. try:
  306. clear = node["parameters"]["clear"]
  307. except:
  308. node["parameters"]["clear"] = 0
  309. try:
  310. newLine = node["parameters"]["newLine"]
  311. except:
  312. node["parameters"]["newLine"] = 1
  313. for param in params:
  314. try:
  315. iframe = param["iframe"]
  316. except:
  317. param["iframe"] = False
  318. try:
  319. param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
  320. except:
  321. pass
  322. try:
  323. node["parameters"]["recordASField"] = param["recordASField"]
  324. except:
  325. node["parameters"]["recordASField"] = 1
  326. try:
  327. splitLine = int(param["splitLine"])
  328. except:
  329. param["splitLine"] = 0
  330. if param["contentType"] == 8:
  331. self.print_and_log(
  332. "默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
  333. self.print_and_log(
  334. "If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
  335. param["optimizable"] = detect_optimizable(param)
  336. elif node["option"] == 4: # 输入文字
  337. try:
  338. index = node["parameters"]["index"] # 索引值
  339. except:
  340. node["parameters"]["index"] = 0
  341. elif node["option"] == 5: # 自定义操作
  342. try:
  343. clear = node["parameters"]["clear"]
  344. except:
  345. node["parameters"]["clear"] = 0
  346. try:
  347. newLine = node["parameters"]["newLine"]
  348. except:
  349. node["parameters"]["newLine"] = 1
  350. elif node["option"] == 7: # 移动到元素
  351. if node["parameters"]["useLoop"]:
  352. if self.task_version <= "0.3.5":
  353. # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
  354. node["parameters"]["xpath"] = ""
  355. self.print_and_log("您的任务版本号为" + self.task_version +
  356. ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
  357. elif node["option"] == 8: # 循环操作
  358. try:
  359. exitElement = node["parameters"]["exitElement"]
  360. if exitElement == "":
  361. node["parameters"]["exitElement"] = "//body"
  362. except:
  363. node["parameters"]["exitElement"] = "//body"
  364. node["parameters"]["quickExtractable"] = False # 是否可以快速提取
  365. try:
  366. skipCount = node["parameters"]["skipCount"]
  367. except:
  368. node["parameters"]["skipCount"] = 0
  369. # 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
  370. if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
  371. try:
  372. params = self.procedure[node["sequence"][0]]["parameters"]["params"]
  373. except:
  374. params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
  375. try:
  376. waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
  377. except:
  378. waitElement = ""
  379. if node["parameters"]["iframe"]:
  380. node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
  381. else:
  382. node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
  383. if node["parameters"]["skipCount"] > 0:
  384. node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
  385. for param in params:
  386. optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
  387. try:
  388. iframe = param["iframe"]
  389. except:
  390. param["iframe"] = False
  391. if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
  392. optimizable = False
  393. if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
  394. node["parameters"]["quickExtractable"] = False
  395. break
  396. if node["parameters"]["quickExtractable"]:
  397. self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
  398. self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
  399. try:
  400. node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
  401. except:
  402. node["parameters"]["clear"] = 0
  403. try:
  404. node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
  405. except:
  406. node["parameters"]["newLine"] = 1
  407. if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
  408. node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
  409. elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
  410. node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
  411. node["parameters"]["quickParams"] = []
  412. for param in params:
  413. content_type = ""
  414. if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
  415. "::text()") >= 0:
  416. content_type = ""
  417. elif param["nodeType"] == 2:
  418. content_type = "//@href"
  419. elif param["nodeType"] == 4: # 图片链接
  420. content_type = "//@src"
  421. elif param["contentType"] == 1:
  422. content_type = "/text()"
  423. elif param["contentType"] == 0:
  424. content_type = "//text()"
  425. if param["relative"]: # 如果是相对XPath
  426. xpath = "." + param["relativeXPath"] + content_type
  427. else:
  428. xpath = param["relativeXPath"] + content_type
  429. # 如果是id()或(//div)[1]这种形式,不需要包/html/body
  430. # if xpath.find("/body") < 0 and xpath.startswith("/"):
  431. # xpath = "/html/body" + xpath
  432. node["parameters"]["quickParams"].append({
  433. "name": param["name"],
  434. "relative": param["relative"],
  435. "xpath": xpath,
  436. "nodeType": param["nodeType"],
  437. "default": param["default"],
  438. })
  439. self.print_and_log("预处理完成|Preprocess completed")
  440. def readFromExcel(self):
  441. if self.inputExcel == "":
  442. return 0
  443. try:
  444. workbook = load_workbook(self.inputExcel)
  445. except:
  446. self.print_and_log("读取Excel失败,将会使用默认参数执行任务,请检查文件路径是否正确:",
  447. os.path.abspath(self.inputExcel))
  448. self.print_and_log(
  449. "Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ",
  450. os.path.abspath(self.inputExcel))
  451. time.sleep(5)
  452. return 0
  453. sheet_name_list = workbook.sheetnames
  454. sheet = workbook[sheet_name_list[0]]
  455. data = []
  456. for row in sheet.iter_rows(values_only=True):
  457. data.append(list(row))
  458. result = list(zip(*data))
  459. result_dict = {}
  460. for row in result:
  461. key = row[0]
  462. values = [str(val) for val in row[1:] if val is not None]
  463. result_dict.setdefault(key, []).extend([values])
  464. data = {}
  465. for key, arr in result_dict.items():
  466. result = []
  467. for cols in zip(*arr):
  468. result.append("~".join(cols))
  469. data[key] = result
  470. try:
  471. if "urlList_0" in data.keys():
  472. self.links = data["urlList_0"]
  473. except:
  474. self.links = "about:blank"
  475. task = self.service
  476. for key, value in data.items():
  477. for i in range(len(task["inputParameters"])):
  478. if key == task["inputParameters"][i]["name"]:
  479. nodeId = int(task["inputParameters"][i]["nodeId"])
  480. node = task["graph"][nodeId]
  481. value = "\r\n".join(value)
  482. if node["option"] == 1:
  483. node["parameters"]["links"] = value
  484. elif node["option"] == 4:
  485. node["parameters"]["value"] = value
  486. elif node["option"] == 8 and node["parameters"]["loopType"] == 0:
  487. node["parameters"]["exitCount"] = int(value)
  488. elif node["option"] == 8:
  489. node["parameters"]["textList"] = value
  490. break
  491. self.print_and_log("已从Excel读取输入参数,覆盖了原有输入参数。")
  492. self.print_and_log(
  493. "Already read input parameters from Excel and overwrite the original input parameters.")
  494. def removeDuplicateData(self):
  495. try:
  496. removeDuplicateData = self.service["removeDuplicate"]
  497. except:
  498. removeDuplicateData = 0
  499. if removeDuplicateData == 1:
  500. self.print_and_log("正在去除重复数据,请稍后……")
  501. self.print_and_log("Removing duplicate data, please wait...")
  502. if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "json" or self.outputFormat == "xlsx":
  503. file_name = "Data/Task_" + \
  504. str(self.id) + "/" + self.saveName + \
  505. '.' + self.outputFormat
  506. if self.outputFormat == "csv" or self.outputFormat == "txt":
  507. df = pd.read_csv(file_name)
  508. df.drop_duplicates(inplace=True)
  509. df.to_csv(file_name, index=False)
  510. elif self.outputFormat == "xlsx":
  511. df = pd.read_excel(file_name)
  512. df.drop_duplicates(inplace=True)
  513. df.to_excel(file_name, index=False)
  514. elif self.outputFormat == "json":
  515. df = pd.read_json(file_name)
  516. df.drop_duplicates(inplace=True)
  517. df.to_json(file_name, orient="records", force_ascii=False)
  518. elif self.outputFormat == "mysql":
  519. self.mysql.remove_duplicate_data()
  520. self.print_and_log("去重完成。")
  521. self.print_and_log("Duplicate data removed.")
  522. def run(self):
  523. # 挨个执行程序
  524. for i in range(len(self.links)):
  525. self.print_and_log("正在执行第", i + 1, "/", len(self.links), "个链接")
  526. self.print_and_log("Executing link", i + 1,
  527. "/", len(self.links))
  528. self.executeNode(0)
  529. self.urlId = self.urlId + 1
  530. # files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
  531. # 如果目录为空,则删除该目录
  532. # if not files:
  533. # os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
  534. self.print_and_log("Done!")
  535. self.print_and_log("执行完成!")
  536. self.saveData(exit=True)
  537. self.removeDuplicateData()
  538. if self.outputFormat == "mysql":
  539. self.mysql.close()
  540. try:
  541. quitWaitTime = self.service["quitWaitTime"]
  542. except:
  543. quitWaitTime = 60
  544. self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
  545. self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
  546. time.sleep(quitWaitTime)
  547. try:
  548. self.browser.quit()
  549. except:
  550. pass
  551. self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
  552. try:
  553. shutil.rmtree(self.option["tmp_user_data_folder"])
  554. except:
  555. pass
  556. self.monitor_event.set()
  557. self.print_and_log("清理完成!|Clean up completed!")
  558. self.print_and_log("您现在可以安全的关闭此窗口了。|You can safely close this window now.")
  559. def recordLog(self, *args, **kwargs):
  560. now = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
  561. print(now + ":", *args, file=self.logs, **kwargs)
  562. # 定义一个自定义的 print 函数,它将内容同时打印到屏幕和文件中
  563. def print_and_log(self, *args, **kwargs):
  564. now = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
  565. # 将内容打印到屏幕
  566. print(*args, **kwargs)
  567. # 将内容写入文件
  568. print(now + ":", *args, file=self.logs, **kwargs)
  569. def saveData(self, exit=False):
  570. # 每save_threshold条保存一次
  571. if exit == True or len(self.OUTPUT) >= self.save_threshold:
  572. # 写入日志
  573. # self.recordLog("持久化存储数据/Persistently store data")
  574. if self.log:
  575. with open("Data/Task_" + str(self.id) + "/" + self.saveName + '.log', 'a',
  576. encoding='utf-8-sig') as file_obj:
  577. file_obj.write(self.logs.getvalue())
  578. file_obj.close()
  579. # 写入已执行步数
  580. with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'w',
  581. encoding='utf-8-sig') as file_obj:
  582. file_obj.write(str(self.totalSteps + 1))
  583. file_obj.close()
  584. # 写入数据
  585. if self.outputFormat == "csv" or self.outputFormat == "txt":
  586. file_name = "Data/Task_" + \
  587. str(self.id) + "/" + self.saveName + \
  588. '.' + self.outputFormat
  589. write_to_csv(file_name, self.OUTPUT,
  590. self.outputParametersRecord)
  591. elif self.outputFormat == "xlsx":
  592. file_name = "Data/Task_" + \
  593. str(self.id) + "/" + self.saveName + '.xlsx'
  594. write_to_excel(
  595. file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
  596. elif self.outputFormat == "json":
  597. file_name = "Data/Task_" + \
  598. str(self.id) + "/" + self.saveName + '.json'
  599. write_to_json(file_name, self.OUTPUT, self.outputParametersTypes,
  600. self.outputParametersRecord, self.outputParameters.keys())
  601. elif self.outputFormat == "mysql":
  602. self.mysql.write_to_mysql(
  603. self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
  604. self.OUTPUT = []
  605. self.logs.truncate(0) # 清空日志
  606. self.logs.seek(0) # 清空日志
  607. def scrollDown(self, param, rt=""):
  608. try:
  609. time.sleep(param["scrollWaitTime"]) # 下拉前等待
  610. except:
  611. pass
  612. scrollType = int(param["scrollType"])
  613. try:
  614. param["scrollCount"] = int(param["scrollCount"])
  615. except:
  616. param["scrollCount"] = 1
  617. try:
  618. if scrollType != 0 and param["scrollCount"] > 0: # 控制屏幕向下滚动
  619. if scrollType == 1 or scrollType == 2:
  620. for i in range(param["scrollCount"]):
  621. body = self.browser.find_element(
  622. By.CSS_SELECTOR, "body", iframe=param["iframe"])
  623. if scrollType == 1:
  624. body.send_keys(Keys.PAGE_DOWN)
  625. elif scrollType == 2:
  626. body.send_keys(Keys.END)
  627. try:
  628. time.sleep(param["scrollWaitTime"]) # 下拉完等待
  629. except:
  630. pass
  631. self.print_and_log("向下滚动,第", i + 1, "次。")
  632. self.print_and_log(
  633. "Scroll down, the", i + 1, "time.")
  634. elif scrollType == 3:
  635. bodyText = ""
  636. i = 0
  637. while True:
  638. newBodyText = self.browser.find_element(
  639. By.CSS_SELECTOR, "body", iframe=False).text
  640. if param["iframe"]: # 如果标记了iframe
  641. iframes = self.browser.find_elements(
  642. By.CSS_SELECTOR, "iframe", iframe=False)
  643. for iframe in iframes:
  644. self.browser.switch_to.default_content()
  645. self.browser.switch_to.frame(iframe)
  646. iframe_text = super(self.browser.__class__, self.browser).find_element(
  647. By.CSS_SELECTOR, "body").text # 用super调用父类的方法
  648. newBodyText += iframe_text
  649. self.browser.switch_to.default_content()
  650. if newBodyText == bodyText:
  651. self.print_and_log("页面已检测不到新内容,停止滚动。")
  652. self.print_and_log(
  653. "No new content detected on the page, stop scrolling.")
  654. break
  655. else:
  656. bodyText = newBodyText
  657. body = self.browser.find_element(
  658. By.CSS_SELECTOR, "body", iframe=param["iframe"])
  659. body.send_keys(Keys.END)
  660. self.print_and_log("滚动到底部,第", i + 1, "次。")
  661. self.print_and_log(
  662. "Scroll to the bottom, the", i + 1, "time.")
  663. i = i + 1
  664. try:
  665. time.sleep(param["scrollWaitTime"]) # 下拉完等待
  666. except:
  667. pass
  668. except Exception as e:
  669. self.print_and_log("滚动屏幕时出错|Error scrolling screen:", e)
  670. try:
  671. self.browser.execute_script('window.stop()')
  672. except:
  673. pass
  674. if scrollType != 0 and param["scrollCount"] > 0: # 控制屏幕向下滚动
  675. if scrollType == 1 or scrollType == 2:
  676. for i in range(param["scrollCount"]):
  677. body = self.browser.find_element(
  678. By.CSS_SELECTOR, "body", iframe=param["iframe"])
  679. if scrollType == 1:
  680. body.send_keys(Keys.PAGE_DOWN)
  681. elif scrollType == 2:
  682. body.send_keys(Keys.END)
  683. try:
  684. time.sleep(param["scrollWaitTime"]) # 下拉完等待
  685. except:
  686. pass
  687. self.print_and_log("向下滚动,第", i + 1, "次。")
  688. self.print_and_log(
  689. "Scroll down, the", i + 1, "time.")
  690. elif scrollType == 3:
  691. bodyText = ""
  692. i = 0
  693. while True:
  694. newBodyText = self.browser.find_element(
  695. By.CSS_SELECTOR, "body", iframe=False).text
  696. if param["iframe"]: # 如果标记了iframe
  697. iframes = self.browser.find_elements(
  698. By.CSS_SELECTOR, "iframe", iframe=False)
  699. for iframe in iframes:
  700. self.browser.switch_to.default_content()
  701. self.browser.switch_to.frame(iframe)
  702. iframe_text = super(self.browser.__class__, self.browser).find_element(
  703. By.CSS_SELECTOR, "body").text # 用super调用父类的方法
  704. newBodyText += iframe_text
  705. self.browser.switch_to.default_content()
  706. if newBodyText == bodyText:
  707. self.print_and_log("页面已检测不到新内容,停止滚动。")
  708. self.print_and_log(
  709. "No new content detected on the page, stop scrolling.")
  710. break
  711. else:
  712. bodyText = newBodyText
  713. body = self.browser.find_element(
  714. By.CSS_SELECTOR, "body", iframe=param["iframe"])
  715. body.send_keys(Keys.END)
  716. self.print_and_log("滚动到底部,第", i + 1, "次。")
  717. self.print_and_log(
  718. "Scroll to the bottom, the", i + 1, "time.")
  719. i = i + 1
  720. try:
  721. time.sleep(param["scrollWaitTime"]) # 下拉完等待
  722. except:
  723. pass
  724. if rt != "":
  725. rt.end()
  def execute_code(self, codeMode, code, max_wait_time, element=None, iframe=False):
      """Run a user-configured snippet and return its result as a string.

      Args:
          codeMode: Execution mode (anything ``int()`` accepts):
              0 = JavaScript in the page, 2 = JavaScript with ``element``
              passed as the script argument, 1 = shell command,
              5 = Python ``exec``, 6 = Python ``eval``. Any other value
              executes nothing.
          code: Snippet text. ``Field[...]`` placeholders are expanded by
              ``replace_field_values`` before execution. An empty string
              returns ``""`` immediately, before anything else is touched.
          max_wait_time: Script/command timeout; ``0`` is replaced by 999999.
          element: Element handed to the script in mode 2.
          iframe: Whether the snippet should run inside an iframe.

      Returns:
          ``str(output)``. This is ``""`` when ``code`` is empty or execution
          failed. A successful mode-5 run yields ``"None"`` because
          ``exec()`` always returns ``None``.

      NOTE(security): modes 1, 5 and 6 run task-supplied text verbatim through
      ``shell=True``, ``exec`` and ``eval``. Only run task files from a
      trusted source.
      """
      if code == "":
          return ""
      output = ""
      if max_wait_time == 0:
          max_wait_time = 999999
      # Replace Field["..."] placeholders with values from outputParameters.
      code = replace_field_values(code, self.outputParameters, self)
      if iframe and self.browser.iframe_env == False:
          # Enter the first iframe that can be switched to.
          self.browser.switch_to.default_content()
          frames = self.browser.find_elements(
              By.CSS_SELECTOR, "iframe", iframe=False)
          # A distinct loop name keeps the `iframe` argument intact.
          for frame in frames:
              try:
                  self.browser.switch_to.default_content()
                  self.browser.switch_to.frame(frame)
                  self.browser.iframe_env = True
                  break
              except Exception:
                  self.print_and_log("Iframe switch failed")
      elif not iframe and self.browser.iframe_env == True:
          self.browser.switch_to.default_content()
          self.browser.iframe_env = False
      mode = int(codeMode)
      if mode == 0:
          self.recordLog("Execute JavaScript:" + code)
          self.recordLog("执行JavaScript:" + code)
          self.browser.set_script_timeout(max_wait_time)
          try:
              output = self.browser.execute_script(code)
          except Exception as e:
              output = ""
              self.recordLog("JavaScript execution failed")
              self.recordLog(e)  # keep the cause instead of discarding it
      elif mode == 2:
          self.recordLog("Execute JavaScript for element:" + code)
          self.recordLog("对元素执行JavaScript:" + code)
          self.browser.set_script_timeout(max_wait_time)
          try:
              output = self.browser.execute_script(code, element)
          except Exception as e:
              output = ""
              self.recordLog("JavaScript execution failed")
              self.recordLog(e)
      elif mode == 5:
          try:
              code = readCode(code)
              # exec() returns None, so a successful run reports "None".
              output = exec(code)
              self.recordLog("执行下面的代码:" + code)
              self.recordLog("Execute the following code:" + code)
          except Exception as e:
              self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
              self.print_and_log("Error executing the following code:" +
                                 code, ", error is:", e)
      elif mode == 6:
          try:
              code = readCode(code)
              output = eval(code)
              self.recordLog("获得下面的代码返回值:" + code)
              self.recordLog(
                  "Get the return value of the following code:" + code)
          except Exception as e:
              self.print_and_log("获得下面的代码返回值时出错:" + code, ",错误为:", e)
              self.print_and_log(
                  "Error executing and getting return value the following code:" + code, ", error is:", e)
      elif mode == 1:
          self.recordLog("Execute System Call:" + code)
          self.recordLog("执行系统命令:" + code)
          try:
              completed = subprocess.run(
                  code, capture_output=True, text=True, timeout=max_wait_time, shell=True)
              # Only stdout is reported back to the caller.
              output = completed.stdout
              self.print_and_log(output)
          except subprocess.TimeoutExpired:
              self.recordLog("Command timed out")
              self.recordLog("命令执行超时")
          except Exception as e:
              self.print_and_log(e)
              self.recordLog("Command execution failed")
              self.recordLog("命令执行失败")
      try:
          output = str(output)
      except Exception:
          output = "无法转换为字符串|Unable to convert to string"
          self.print_and_log("无法转换为字符串|Unable to convert to string", output)
      return output
  def customOperation(self, node, loopValue, loopPath, index):
      """Execute a "custom operation" node and store its output under the node title.

      ``node["parameters"]["codeMode"]`` selects the action:
      2 = JavaScript on the ``index``-th element matched by ``loopPath``,
      3 = request loop break, 4 = request loop continue, 7 = pause the task,
      8 = refresh the page, 9 = send e-mail, 10 = clear all field values,
      11 = emit a new data row, 12 = save, quit the browser and exit the
      process. Every other mode is forwarded to ``execute_code``.

      Args:
          node: Procedure node; its ``parameters`` and ``title`` are read.
          loopValue: Current loop value (not used by this method).
          loopPath: XPath of the enclosing loop; used in mode 2.
          index: Position of the current loop item; used in mode 2.
      """
      params = node["parameters"]
      if params["clear"] == 1:
          self.clearOutputParameters()
      codeMode = int(params["codeMode"])
      code = params["code"]
      output = ""
      max_wait_time = int(params["waitTime"])
      if codeMode == 2:  # inside a loop, loopPath is the real XPath
          try:
              loopPath = replace_field_values(
                  loopPath, self.outputParameters, self)
              elements = self.browser.find_elements(
                  By.XPATH, loopPath, iframe=params["iframe"])
              element = elements[index]
              output = self.execute_code(
                  codeMode, code, max_wait_time, element, iframe=params["iframe"])
          except Exception as e:
              output = ""
              self.print_and_log("JavaScript execution failed")
              # Keep the cause (e.g. index out of range) in the log.
              self.recordLog(e)
      elif codeMode == 3:
          self.BREAK = True
          self.recordLog("跳出循环|Break the loop")
      elif codeMode == 4:
          self.CONTINUE = True
          self.recordLog("跳过本次循环|Skip this loop")
      elif codeMode == 7:  # pause the task
          self.event.clear()
          self.print_and_log(
              f"根据设置的自定义操作,任务已暂停,长按{self.service['pauseKey']}键继续执行...|Task paused according to custom operation, long press '{self.service['pauseKey']}' to continue...")
      elif codeMode == 8:  # refresh the page
          self.browser.refresh()
          self.print_and_log("根据设置的自定义操作,任务已刷新页面|Task refreshed page according to custom operation")
      elif codeMode == 9:  # send e-mail
          send_email(params["emailConfig"])
      elif codeMode == 10:  # clear all field values
          self.clearOutputParameters()
      elif codeMode == 11:  # emit a new data row
          line = new_line(self.outputParameters,
                          self.maxViewLength, self.outputParametersRecord)
          self.OUTPUT.append(line)
      elif codeMode == 12:  # quit the program
          self.print_and_log("根据设置的自定义操作,任务已退出|Task exited according to custom operation")
          self.saveData(exit=True)
          self.browser.quit()
          self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
          try:
              shutil.rmtree(self.option["tmp_user_data_folder"])
          except Exception:
              # Best-effort cleanup: the process exits right below.
              pass
          self.print_and_log("清理完成!|Clean up completed!")
          os._exit(0)
      else:  # 0 1 5 6
          output = self.execute_code(
              codeMode, code, max_wait_time, iframe=params["iframe"])
      # NOTE(review): the listing lost its indentation. The lines below are
      # assumed to run for every mode, since `output` is initialised for all
      # of them. Confirm against the original file.
      recordASField = bool(params["recordASField"])
      self.outputParameters[node["title"]] = output
      if recordASField and params["newLine"]:
          line = new_line(self.outputParameters,
                          self.maxViewLength, self.outputParametersRecord)
          self.OUTPUT.append(line)
  def switchSelect(self, param, loopValue):
      """Change the selected option of a ``<select>`` element.

      ``param["optionMode"]`` chooses the strategy: 0 = advance to the next
      option (wrapping around), 1 = select by index, 2 = select by value,
      3 = select by visible text. With ``param["useLoop"]`` the option value
      comes from ``loopValue`` (optionally its ``~``-separated part number
      ``param["index"]``) and the mode is forced to 1.

      Args:
          param: Node parameters (``optionMode``, ``optionValue``, ``useLoop``,
              ``index``, ``xpath``, ``iframe``).
          loopValue: Current loop value; read only when ``useLoop`` is set.
      """
      optionMode = param["optionMode"]
      optionValue = param["optionValue"]
      if param["useLoop"]:
          index = param["index"]
          if index != 0:
              try:
                  optionValue = loopValue.split("~")[index - 1]
              except Exception:
                  # NOTE(review): the message promises the whole text value,
                  # but optionValue keeps the configured one here.
                  self.print_and_log("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
                  self.print_and_log(
                      "Failed to get value, maybe because the index is out of range, will use the entire text value")
          else:
              optionValue = loopValue
          # NOTE(review): indentation was lost in the listing. This is assumed
          # to apply to every useLoop case; confirm against the original.
          optionMode = 1
      # Bind xpath before the try so the failure message below can always
      # print it, even when placeholder expansion itself raises.
      xpath = param["xpath"]
      try:
          xpath = replace_field_values(
              param["xpath"], self.outputParameters, self)
          dropdown = Select(self.browser.find_element(
              By.XPATH, xpath, iframe=param["iframe"]))
          try:
              if optionMode == 0:
                  # Index of the currently selected option.
                  current_index = dropdown.options.index(
                      dropdown.first_selected_option)
                  # Next option, wrapping back to the first.
                  next_index = (current_index + 1) % len(dropdown.options)
                  dropdown.select_by_index(next_index)
              elif optionMode == 1:
                  dropdown.select_by_index(int(optionValue))
              elif optionMode == 2:
                  dropdown.select_by_value(optionValue)
              elif optionMode == 3:
                  dropdown.select_by_visible_text(optionValue)
          except Exception:
              self.print_and_log("切换下拉框选项失败:", xpath,
                                 param["optionMode"], param["optionValue"])
              self.print_and_log("Failed to change drop-down box option:",
                                 xpath, param["optionMode"], param["optionValue"])
      except Exception:
          self.print_and_log("找不到下拉框元素:", xpath)
          self.print_and_log("Cannot find drop-down box element:", xpath)
  def moveToElement(self, param, loopElement=None, loopPath="", index=0):
      """Move the mouse pointer over an element.

      Args:
          param: Node parameters (``xpath``, ``useLoop``, ``iframe``).
          loopElement: Current loop item (not used by this method).
          loopPath: XPath of the enclosing loop, used when ``useLoop`` is set.
          index: Position of the current loop item within ``loopPath``.
      """
      time.sleep(0.1)  # short pause before moving
      loopPath = replace_field_values(loopPath, self.outputParameters, self)
      xpath = replace_field_values(
          param["xpath"], self.outputParameters, self)
      if param["useLoop"]:  # inside a loop, loopPath is the real XPath
          if xpath == "":
              path = loopPath
          else:
              path = "(" + loopPath + ")" + \
                  "[" + str(index + 1) + "]" + \
                  xpath
              # The combined path already pins the loop item, so the match
              # list is indexed from 0.
              index = 0
      else:
          index = 0
          path = xpath  # otherwise use the node's own XPath
      path = replace_field_values(path, self.outputParameters, self)
      try:
          elements = self.browser.find_elements(
              By.XPATH, path, iframe=param["iframe"])
          element = elements[index]
          try:
              ActionChains(self.browser).move_to_element(element).perform()
          except Exception:
              # Report the path actually searched; `xpath` alone is empty
              # when the loop item itself is the target.
              self.print_and_log("移动鼠标到元素失败:", path)
              self.print_and_log("Failed to move mouse to element:", path)
      except Exception:
          self.print_and_log("找不到元素:", path)
          self.print_and_log("Cannot find element:", path)
  958. # 执行节点关键函数部分
  def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
      """Execute one procedure node, then apply its post-execution wait.

      Steps: optionally wait for ``parameters["waitElement"]`` to be present,
      dispatch on ``node["option"]``, sleep for the configured wait (skipped
      for options 0 and 2), and finally block on ``self.event``.

      Args:
          nodeId: Key of the node in ``self.procedure``.
          loopValue: Current loop value passed on to the operation.
          loopPath: XPath of the enclosing loop.
          index: Position of the current loop item.

      Errors raised by the operation itself are logged and execution
      continues; they are not propagated.
      """
      node = self.procedure[nodeId]
      # Bound up front so the handler below can test it even when the
      # parameter lookup itself raises.
      waitElement = ""
      entered_frame = False
      try:
          if node["parameters"]["waitElement"] != "":
              waitElement = replace_field_values(
                  node["parameters"]["waitElement"], self.outputParameters, self)
              waitElementTime = float(node["parameters"]["waitElementTime"])
              waitElementIframeIndex = node["parameters"]["waitElementIframeIndex"]
              self.print_and_log("等待元素出现:", waitElement)
              self.print_and_log(
                  "Waiting for element to appear:", waitElement)
              if waitElementIframeIndex > 0:
                  iframes = self.browser.find_elements(
                      By.CSS_SELECTOR, "iframe", iframe=False)
                  iframe = iframes[waitElementIframeIndex - 1]
                  self.browser.switch_to.frame(iframe)
                  entered_frame = True
              WebDriverWait(self.browser, waitElementTime).until(
                  EC.presence_of_element_located((By.XPATH, waitElement))
              )
      except Exception as e:
          if waitElement != "":
              self.print_and_log("等待元素出现超时:", waitElement, ",将继续执行。")
              self.print_and_log("Timeout waiting for element to appear:",
                                 waitElement, ", will continue to execute.")
          self.recordLog(e)
          self.recordLog("Wait element not found")
      finally:
          # Leave the iframe on timeout too; otherwise the following
          # operation would run inside the wrong frame.
          if entered_frame:
              try:
                  self.browser.switch_to.default_content()
              except Exception as e:
                  self.recordLog(e)
      self.recordLog("执行节点|Execute node:", node["title"])
      try:
          # Dispatch on the node type.
          if node["option"] == 0 or node["option"] == 10:  # root / condition branch
              for i in node["sequence"]:  # run the children in order
                  self.executeNode(i, loopValue, loopPath, index)
          elif node["option"] == 1:  # open page
              self.openPage(node["parameters"], loopValue)
          elif node["option"] == 2:  # click element
              self.clickElement(node["parameters"], loopValue, loopPath, index)
          elif node["option"] == 3:  # extract data
              # Extraction steps before startSteps are skipped so that an
              # interrupted run can resume incrementally.
              if self.totalSteps >= self.startSteps:
                  self.getData(node["parameters"], loopValue, node["isInLoop"],
                               parentPath=loopPath, index=index)
                  self.saveData()
              else:
                  self.print_and_log("跳过第" + str(self.totalSteps) + "次提取数据。")
                  self.print_and_log(
                      "Skip the " + str(self.totalSteps) + "th data extraction.")
              self.totalSteps += 1  # count every extraction step
          elif node["option"] == 4:  # input text
              self.inputInfo(node["parameters"], loopValue)
          elif node["option"] == 5:  # custom operation
              self.customOperation(node, loopValue, loopPath, index)
              self.saveData()
          elif node["option"] == 6:  # switch drop-down option
              self.switchSelect(node["parameters"], loopValue)
          elif node["option"] == 7:  # move mouse to element
              self.moveToElement(node["parameters"], loopValue, loopPath, index)
          elif node["option"] == 8:  # loop
              self.loopExecute(node, loopValue, loopPath, index)
          elif node["option"] == 9:  # conditional
              self.judgeExecute(node, loopValue, loopPath, index)
      except Exception as e:
          self.print_and_log("执行节点<" + node["title"] + ">时出错,将继续执行,错误为:", e)
          self.print_and_log("Error executing node <" + node["title"] + ">, will continue to execute, error is:", e)
      # Post-execution wait; click nodes handle their own waiting.
      if node["option"] != 0 and node["option"] != 2:
          waitTime = 0.01  # default wait in seconds
          if node["parameters"]["wait"] >= 0:
              waitTime = node["parameters"]["wait"]
          try:
              waitType = int(node["parameters"]["waitType"])
          except (KeyError, TypeError, ValueError):
              waitType = 0
          if waitType == 0:  # fixed wait
              time.sleep(waitTime)
          elif waitType == 1:  # random wait between 0.5x and 1.5x
              time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
      # NOTE(review): indentation was lost in the listing. The event wait is
      # assumed to run for every node; confirm against the original.
      self.event.wait()  # blocks while the task is paused
  1044. # 对判断条件的处理
  def judgeExecute(self, node, loopElement, clickPath="", index=0):
      """Run the first condition branch of ``node`` whose condition holds.

      Branches are tried in ``node["sequence"]`` order. Each branch's
      ``parameters["class"]`` selects the test: 0 = unconditional, 1 = page
      body contains text, 2 = page contains XPath, 3 = loop item contains
      text, 4 = loop item contains a relative XPath, 5-8 = result of a code
      snippet (JavaScript, system command, JavaScript on the loop item,
      Python ``eval``). If no branch matches, only a log entry is written.

      Args:
          node: The conditional node.
          loopElement: Current loop item, used by tests 3, 4, 7 and 8.
          clickPath: Loop XPath forwarded to the chosen branch.
          index: Loop index forwarded to the chosen branch.
      """
      # Condition class -> execute_code mode for the snippet-based tests.
      snippet_modes = {5: 0, 6: 1, 7: 2, 8: 6}
      chosen = 0  # id of the branch to run; 0 means "none"
      for branch_id in node["sequence"]:
          settings = self.procedure[branch_id]["parameters"]
          kind = int(settings["class"])
          matched = False
          if kind == 0:  # no condition at all
              matched = True
          elif kind == 1:  # page contains text
              try:
                  page_text = self.browser.find_element(
                      By.CSS_SELECTOR, "body", iframe=settings["iframe"]).text
                  needle = replace_field_values(
                      settings["value"], self.outputParameters, self)
                  matched = page_text.find(needle) >= 0
              except:  # lookup failed, try the next branch
                  continue
          elif kind == 2:  # page contains element
              try:
                  locator = replace_field_values(
                      settings["value"], self.outputParameters, self)
                  if self.browser.find_element(By.XPATH, locator, iframe=settings["iframe"]):
                      matched = True
              except:  # missing element or bad XPath, try the next branch
                  continue
          elif kind == 3:  # loop item contains text
              try:
                  needle = replace_field_values(
                      settings["value"], self.outputParameters, self)
                  matched = loopElement.text.find(needle) >= 0
              except:
                  continue
          elif kind == 4:  # loop item contains element
              try:
                  # The first character of the stored XPath is dropped
                  # before searching below the loop item.
                  locator = replace_field_values(
                      settings["value"][1:], self.outputParameters, self)
                  if loopElement.find_element(By.XPATH, locator):
                      matched = True
              except:
                  continue
          elif kind <= 8:  # verdict comes from a snippet's return value
              mode = snippet_modes.get(kind)
              if mode is not None:
                  # Only the loop-item variants (7, 8) receive the element.
                  target = loopElement if kind >= 7 else None
                  result = self.execute_code(
                      mode, settings["code"], settings["waitTime"], target,
                      iframe=settings["iframe"])
              try:
                  # "rue" matches both "true" and "True".
                  if result.find("rue") != -1:
                      verdict = 1
                  else:
                      verdict = int(result)
              except:  # non-numeric (or missing) result counts as false
                  verdict = 0
              matched = verdict > 0
          if matched:
              chosen = branch_id
              break
      if chosen == 0:
          self.recordLog(
              "判断条件内所有条件分支的条件都不满足|None of the conditions in the judgment condition are met")
      else:
          self.executeNode(chosen, loopElement, clickPath, index)
  def handleHistory(self, node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, element=None, elements=None):
      """Bring the browser back to the loop's starting tab and history entry.

      Called by ``loopExecute`` after one loop iteration. It closes tabs until
      ``thisHandle`` is current again and, when ``self.history["index"]``
      differs from ``thisHistoryLength``, navigates the history by the
      difference and looks the loop element(s) up again.

      Args:
          node: Loop node; ``parameters["historyWait"]`` and
              ``parameters["iframe"]`` are read.
          xpath: Loop XPath used to re-locate elements ("" skips the lookup).
          thisHandle: Window handle recorded when the loop started.
          thisHistoryURL: URL recorded when the loop started.
          thisHistoryLength: ``history.length`` recorded when the loop started.
          index: Current loop index; returned unchanged.
          element: Current single element (fixed-list loops) or None.
          elements: Current element list (variable-list loops) or None.

      Returns:
          ``(index, element)``; ``element`` is either the refreshed lookup
          result or the value passed in (``elements`` if ``element`` is None).

      NOTE(review): this listing lost its indentation. The nesting below is
      reconstructed from the control flow, in particular which ``if`` the
      final ``else`` belongs to and where ``time.sleep(2)`` sits. Confirm
      against the original file.
      """
      try:
          changed_handle = self.browser.current_window_handle != thisHandle
      except:  # the current page may have been closed unexpectedly
          self.browser.switch_to.window(
              self.browser.window_handles[-1])
          changed_handle = self.browser.window_handles[-1] != thisHandle
      if changed_handle:  # the active tab changed during the iteration
          try:
              while True:  # close tabs until the starting tab is current
                  self.browser.close()  # close the tab that is no longer needed
                  self.browser.switch_to.window(
                      self.browser.window_handles[-1])
                  if self.browser.current_window_handle == thisHandle:
                      break
          except Exception as e:
              self.print_and_log("关闭标签页发生错误:", e)
              self.print_and_log(
                  "Error occurred while closing tab: ", e)
      # History changed during the iteration (only checked for the tab the
      # history record belongs to).
      if self.history["index"] != thisHistoryLength and self.history["handle"] == self.browser.current_window_handle:
          difference = thisHistoryLength - self.history["index"]  # history offset to undo
          self.browser.execute_script('history.go(' + str(difference) + ')')  # go back in history
          # if node["parameters"]["historyWait"] > 2:
          time.sleep(node["parameters"]["historyWait"])  # wait after going back
          # else:
          # time.sleep(2)
          try:
              self.browser.execute_script('window.stop()')
          except:
              pass
          ti = 0  # number of forward steps attempted below
          # print("CURRENT URL:", self.browser.current_url)
          # time.sleep(2)
          # if self.browser.current_url.startswith("data:") or self.browser.current_url.startswith("chrome:"):
          if self.browser.current_url != thisHistoryURL and self.history["index"] != thisHistoryLength and self.history["handle"] == self.browser.current_window_handle:
              while self.browser.current_url != thisHistoryURL:  # still not on the starting URL
                  try:
                      self.browser.execute_script("history.go(1)")  # step forward once (original note: for data: URLs)
                  except:  # e.g. script timeout
                      pass
                  ti += 1
                  # Stop once the URL matches or after more attempts than
                  # the recorded history length.
                  if self.browser.current_url == thisHistoryURL or ti > thisHistoryLength:
                      break
                  time.sleep(2)
          if xpath != "":
              if element == None:  # variable element list: refresh the whole list
                  element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
              else:  # fixed element list: refresh the single element
                  element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
          # if index > 0:
          # index -= 1  # (disabled) retry once for data: URLs
      else:
          if element == None:
              element = elements
      return index, element
  1178. # 对循环的处理
  def loopExecute(self, node, loopValue, clickPath="", index=0):
      """Execute a loop node by running its child sequence once per loop item.

      ``node["parameters"]["loopType"]`` selects the iteration source:
      0 = a single element located again on every pass, 1 = all elements
      matching one XPath, 2 = a newline-separated list of XPaths, 3 = a
      newline-separated text list, 4 = a newline-separated URL list,
      5/6/7 = repeat while a JavaScript / system / Python-eval snippet yields
      a positive code. When ``parameters["quickExtractable"]`` is truthy the
      rows are instead extracted directly from the parsed page source.

      ``self.BREAK`` / ``self.CONTINUE`` set by child nodes end the loop or
      the current pass. After the loop the history record is reset and
      ``scrollDown`` is called with the node parameters.

      Args:
          node: The loop node.
          loopValue: Value of an enclosing loop (not read in this method).
          clickPath: XPath of an enclosing loop (not read in this method).
          index: Rebound locally as the iteration counter.

      NOTE(review): this listing lost its indentation. Block nesting is
      reconstructed from the control flow and should be confirmed against
      the original file.
      """
      time.sleep(0.1)  # short forced wait before the loop starts
      thisHandle = self.browser.current_window_handle  # tab the loop started in
      try:
          thisHistoryLength = self.browser.execute_script(
              'return history.length')  # history length when the loop started
      except:
          thisHistoryLength = 0
      self.history["index"] = thisHistoryLength
      self.history["handle"] = thisHandle
      thisHistoryURL = self.browser.current_url
      # Fast path: extract straight from the parsed page source.
      if node["parameters"]["quickExtractable"]:
          self.browser.switch_to.default_content()  # back to the top-level document
          tree = html.fromstring(self.browser.page_source)
          if int(node["parameters"]["loopType"]) == 1:  # variable element list
              baseXPath = replace_field_values(node["parameters"]["baseXPath"], self.outputParameters, self)
              rows = tree.xpath(baseXPath)
          elif int(node["parameters"]["loopType"]) == 2:  # fixed element list
              rows = []
              for path in node["parameters"]["baseXPath"].split("\n"):
                  baseXPath = replace_field_values(path, self.outputParameters, self)
                  rows.extend(tree.xpath(baseXPath))
          for row in rows:
              if node["parameters"]["clear"] == 1:
                  self.clearOutputParameters()
              for param in node["parameters"]["quickParams"]:
                  xpath = replace_field_values(param["xpath"], self.outputParameters, self)
                  content = row.xpath(xpath)
                  try:
                      # Join the non-blank results with single spaces.
                      content = ' '.join(result.strip()
                                         for result in content if result.strip())
                      # For node types 2 and 4 (original note: links and
                      # images) resolve the value against the current URL.
                      if param["nodeType"] == 2 or param["nodeType"] == 4:
                          base_url = self.browser.current_url
                          content = urljoin(base_url, content)
                      if len(content) == 0:
                          content = param["default"]
                  except:
                      content = param["default"]
                  self.outputParameters[param["name"]] = content
              if node["parameters"]["newLine"]:
                  line = new_line(self.outputParameters,
                                  self.maxViewLength, self.outputParametersRecord)
                  self.OUTPUT.append(line)
              # NOTE(review): placement (per row vs. after all rows) could not
              # be recovered from the listing.
              self.saveData()
      elif int(node["parameters"]["loopType"]) == 0:  # single-element loop
          # No tab-switch handling in this mode.
          count = 0  # completed passes
          bodyText = "-"
          while True:  # do-while style loop
              try:
                  finished = False
                  if node["parameters"]["exitCount"] == 0:
                      # No fixed pass count: stop when the text of the exit
                      # element(s) no longer changes between passes.
                      # newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
                      try:
                          exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
                          newBodyText = ""
                          for exitElement in exitElements:
                              newBodyText += exitElement.text
                      except Exception as e:
                          self.print_and_log(f"设定的退出循环元素:{node['parameters']['exitElement']}的文本无法获取,本次循环将不再检测元素文本是否变化,将会继续执行,为解决此问题,您可以修改检测元素文本不变的元素为其他元素,或者将循环次数设定为固定次数大于0的值。")
                          self.print_and_log(f"The text of the exit loop element set: {node['parameters']['exitElement']} cannot be obtained, this loop will no longer check whether the text of the element has changed, and will continue to execute. To solve this problem, you can modify the element whose text does not change to other elements, or set the number of loops to a fixed number greater than 0.")
                          self.print_and_log(e)
                          exitElements = []
                          # Random text so the comparison below never matches
                          # and the loop keeps running.
                          newBodyText = str(random.random())
                      if node["parameters"]["iframe"]:  # iframe flag set: add every iframe's body text
                          iframes = self.browser.find_elements(
                              By.CSS_SELECTOR, "iframe", iframe=False)
                          for iframe in iframes:
                              self.browser.switch_to.default_content()
                              self.browser.switch_to.frame(iframe)
                              # Call the parent class's find_element via super().
                              iframe_text = super(self.browser.__class__, self.browser).find_element(
                                  By.CSS_SELECTOR, "body").text
                              newBodyText += iframe_text
                          self.browser.switch_to.default_content()
                      if newBodyText == bodyText:  # nothing changed since the last pass
                          self.print_and_log("页面已检测不到新内容,停止循环。")
                          self.print_and_log(
                              "No new content detected on the page, stop loop.")
                          finished = True
                          break
                      else:
                          self.print_and_log("检测到页面变化,继续循环。")
                          self.print_and_log(
                              "Page changed detected, continue loop.")
                          bodyText = newBodyText
                  xpath = replace_field_values(
                      node["parameters"]["xpath"], self.outputParameters, self)
                  # self.recordLog("循环元素|Loop element:", xpath)
                  element = self.browser.find_element(
                      By.XPATH, xpath, iframe=node["parameters"]["iframe"])
                  for i in node["sequence"]:  # run the children in order
                      self.executeNode(
                          i, element, xpath, 0)
                      if self.BREAK or self.CONTINUE:  # skip the remaining children
                          self.CONTINUE = False
                          break
                  if self.BREAK:  # a child requested leaving the loop
                      self.BREAK = False
                      finished = True
                      break
                  finished = True
              except NoSuchElementException:
                  # except:
                  self.print_and_log("Single loop element not found: ",
                                     xpath)
                  self.print_and_log("找不到要循环的单个元素: ", xpath)
                  # Run the children once more without an element (original
                  # note: the non-click ones, e.g. data extraction).
                  # NOTE(review): the test reads the loop node's own option,
                  # not the child's — confirm the intent.
                  for i in node["sequence"]:
                      if node["option"] != 2:
                          self.executeNode(
                              i, None, xpath, 0)
                  finished = True
                  break  # element not found: leave the loop
              finally:
                  # Reached with finished == False only when the try body
                  # raised something other than NoSuchElementException. The
                  # `break` below then discards that exception.
                  if not finished:
                      self.print_and_log("\n\n-------Retrying-------\n\n")
                      self.print_and_log("-------Retrying-------: ",
                                         node["parameters"]["xpath"])
                      for i in node["sequence"]:
                          if node["option"] != 2:
                              self.executeNode(
                                  i, None, xpath, 0)
                      break
              count = count + 1
              self.print_and_log("Page: ", count)
              # self.print_and_log(node["parameters"]["exitCount"], "-------")
              if node["parameters"]["exitCount"] == count:  # configured pass count reached
                  break
              if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                  output = self.execute_code(int(
                      node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
                      node["parameters"]["breakCodeWaitTime"],
                      iframe=node["parameters"]["iframe"])
                  code = get_output_code(output)
                  if code <= 0:
                      break
      elif int(node["parameters"]["loopType"]) == 1:  # variable element list
          try:
              xpath = replace_field_values(
                  node["parameters"]["xpath"], self.outputParameters, self)
              elements = self.browser.find_elements(By.XPATH,
                                                    xpath, iframe=node["parameters"]["iframe"])
              # self.recordLog("循环元素|Loop element:", xpath)
              if len(elements) == 0:
                  self.print_and_log("Loop element not found: ",
                                     xpath)
                  self.print_and_log("找不到循环元素:", xpath)
              index = 0
              skipCount = node["parameters"]["skipCount"]
              while index < len(elements):
                  if index < skipCount:
                      index += 1
                      self.print_and_log("跳过第" + str(index) + "个元素")
                      self.print_and_log("Skip the " + str(index) + "th element")
                      continue
                  try:
                      element = elements[index]
                      element_text = element.text  # reading .text raises if the element is stale
                  except StaleElementReferenceException:  # stale element: look the list up again
                      self.print_and_log("元素已失效,重新获取元素|Element has expired, reacquiring element")
                      elements = self.browser.find_elements(By.XPATH,
                                                            xpath, iframe=node["parameters"]["iframe"])
                      element = elements[index]
                  for i in node["sequence"]:  # run the children in order
                      self.executeNode(i, element,
                                       xpath, index)
                      if self.BREAK or self.CONTINUE:  # skip the remaining children
                          self.CONTINUE = False
                          break
                  if self.BREAK:
                      self.BREAK = False
                      break
                  # Restore tab/history and pick up the (possibly refreshed) list.
                  index, elements = self.handleHistory(node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, elements=elements)
                  if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                      output = self.execute_code(int(
                          node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
                          node["parameters"]["breakCodeWaitTime"],
                          iframe=node["parameters"]["iframe"])
                      code = get_output_code(output)
                      if code <= 0:
                          break
                  index = index + 1
          except NoSuchElementException:
              self.print_and_log("Loop element not found: ", xpath)
              self.print_and_log("找不到循环元素:", xpath)
          except Exception as e:
              raise
      elif int(node["parameters"]["loopType"]) == 2:  # fixed element list
          # One XPath per line; the list must be split first.
          paths = node["parameters"]["pathList"].split("\n")
          # for path in node["parameters"]["pathList"].split("\n"):
          index = 0
          skipCount = node["parameters"]["skipCount"]
          while index < len(paths):
              if index < skipCount:
                  index += 1
                  self.print_and_log("跳过第" + str(index) + "个元素")
                  self.print_and_log("Skip the " + str(index) + "th element")
                  continue
              path = paths[index]
              try:
                  path = replace_field_values(
                      path, self.outputParameters, self)
                  element = self.browser.find_element(
                      By.XPATH, path, iframe=node["parameters"]["iframe"])
                  # self.recordLog("循环元素|Loop element:", path)
                  for i in node["sequence"]:  # run the children in order
                      self.executeNode(i, element, path, 0)
                      if self.BREAK or self.CONTINUE:  # skip the remaining children
                          self.CONTINUE = False
                          break
                  if self.BREAK:
                      self.BREAK = False
                      break
                  index, element = self.handleHistory(node, path, thisHandle, thisHistoryURL, thisHistoryLength, index, element=element)
              except NoSuchElementException:
                  self.print_and_log("Loop element not found: ", path)
                  self.print_and_log("找不到循环元素:", path)
                  index += 1
                  continue  # element missing: skip this path
              except Exception as e:
                  raise
              if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                  output = self.execute_code(int(
                      node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
                      node["parameters"]["breakCodeWaitTime"],
                      iframe=node["parameters"]["iframe"])
                  code = get_output_code(output)
                  if code <= 0:
                      break
              index = index + 1
      elif int(node["parameters"]["loopType"]) == 3:  # fixed text list
          textList = node["parameters"]["textList"].split("\n")
          if len(textList) == 1:  # single line: placeholders can be expanded up front
              textList = replace_field_values(
                  node["parameters"]["textList"], self.outputParameters, self).split("\n")
          skipCount = node["parameters"]["skipCount"]
          index = 0
          for text in textList:
              if index < skipCount:
                  index += 1
                  self.print_and_log("跳过第" + str(index) + "个文本")
                  self.print_and_log("Skip the " + str(index) + "th text")
                  continue
              text = replace_field_values(text, self.outputParameters, self)
              # self.recordLog("当前循环文本|Current loop text:", text)
              for i in node["sequence"]:  # run the children in order
                  self.executeNode(i, text, "", 0)
                  if self.BREAK or self.CONTINUE:  # skip the remaining children
                      self.CONTINUE = False
                      break
              if self.BREAK:
                  self.BREAK = False
                  break
              if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                  output = self.execute_code(int(
                      node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
                      node["parameters"]["breakCodeWaitTime"],
                      iframe=node["parameters"]["iframe"])
                  code = get_output_code(output)
                  if code <= 0:
                      break
              # Empty xpath: only tab/history are restored, nothing is re-located.
              index, _ = self.handleHistory(node, "", thisHandle, thisHistoryURL, thisHistoryLength, index)
      elif int(node["parameters"]["loopType"]) == 4:  # fixed URL list
          # tempList = node["parameters"]["textList"].split("\r\n")
          urlList = list(
              filter(isnotnull, node["parameters"]["textList"].split("\n")))  # filtered through isnotnull (original note: drop blank lines)
          if len(urlList) == 1:  # single line: placeholders can be expanded up front
              urlList = replace_field_values(
                  node["parameters"]["textList"], self.outputParameters, self).split("\n")
          skipCount = node["parameters"]["skipCount"]
          index = 0
          for url in urlList:
              if index < skipCount:
                  index += 1
                  self.print_and_log("跳过第" + str(index) + "个网址")
                  self.print_and_log("Skip the " + str(index) + "th url")
                  continue
              url = replace_field_values(url, self.outputParameters, self)
              # self.recordLog("当前循环网址|Current loop url:", url)
              for i in node["sequence"]:
                  self.executeNode(i, url, "", 0)
                  if self.BREAK or self.CONTINUE:  # skip the remaining children
                      self.CONTINUE = False
                      break
              if self.BREAK:
                  self.BREAK = False
                  break
              if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                  output = self.execute_code(int(
                      node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
                      node["parameters"]["breakCodeWaitTime"],
                      iframe=node["parameters"]["iframe"])
                  code = get_output_code(output)
                  if code <= 0:
                      break
      elif int(node["parameters"]["loopType"]) <= 7:  # loop while a snippet's result is positive
          while True:  # do-while style loop
              if int(node["parameters"]["loopType"]) == 5:  # JavaScript
                  output = self.execute_code(
                      0, node["parameters"]["code"], node["parameters"]["waitTime"],
                      iframe=node["parameters"]["iframe"])
              elif int(node["parameters"]["loopType"]) == 6:  # system command
                  output = self.execute_code(
                      1, node["parameters"]["code"], node["parameters"]["waitTime"],
                      iframe=node["parameters"]["iframe"])
              elif int(node["parameters"]["loopType"]) == 7:  # Python (eval mode)
                  output = self.execute_code(
                      6, node["parameters"]["code"], node["parameters"]["waitTime"],
                      iframe=node["parameters"]["iframe"])
              code = get_output_code(output)
              if code <= 0:
                  break
              for i in node["sequence"]:  # run the children in order; the code is the loop value
                  self.executeNode(i, code, node["parameters"]["xpath"], 0)
                  if self.BREAK or self.CONTINUE:  # skip the remaining children
                      self.CONTINUE = False
                      break
              if self.BREAK:
                  self.BREAK = False
                  break
      # Reset the history record for whatever runs after this loop.
      self.history["index"] = thisHistoryLength
      self.history["handle"] = self.browser.current_window_handle
      self.scrollDown(node["parameters"])
  1507. # 打开网页操作
  def openPage(self, param, loopValue):
      """Open a page in the first browser window and record its history state.

      The URL is ``loopValue`` when ``param["useLoop"]`` is set; otherwise
      ``self.links[self.urlId]`` when ``param["url"]`` is not
      ``"about:blank"`` (every output parameter is blanked in that case);
      otherwise the first line of ``param["links"]`` accepted by
      ``isnotnull``. The URL is then passed through ``replace_field_values``.

      Args:
          param: Operation settings. Reads ``useLoop``, ``url``, ``links``,
              ``maxWaitTime`` (falls back to 10 when it cannot be parsed) and
              ``cookies`` (newline-separated ``name=value`` pairs); it is
              also handed to ``self.scrollDown``.
          loopValue: URL supplied by the enclosing loop.

      Cookie lines that are blank or contain no ``=`` are skipped instead of
      aborting the whole operation.
      """
      time.sleep(1)  # Always wait at least one second before opening the page
      if len(self.browser.window_handles) > 1:
          # Close the last window so that the page is opened from the first one
          self.browser.switch_to.window(self.browser.window_handles[-1])
          try:
              self.browser.close()
          except Exception:
              pass
      self.browser.switch_to.window(self.browser.window_handles[0])
      self.history["handle"] = self.browser.current_window_handle

      if param["useLoop"]:
          url = loopValue
      elif param["url"] != "about:blank":
          url = self.links[self.urlId]
          # Clear output parameters
          for key in self.outputParameters:
              self.outputParameters[key] = ""
      else:
          # An open-page operation placed elsewhere in the flow chart (e.g.
          # log in first, then open a second page) reads the first link
          url = list(filter(isnotnull, param["links"].split("\n")))[0]
      # Replace Field[""] placeholders with values from outputParameters
      url = replace_field_values(url, self.outputParameters, self)

      try:
          maxWaitTime = int(param["maxWaitTime"])
      except Exception:
          maxWaitTime = 10  # Default maximum wait time is 10 seconds

      try:
          self.browser.set_page_load_timeout(maxWaitTime)  # Page-load timeout
          self.browser.set_script_timeout(maxWaitTime)
          self.browser.get(url)
          if param["cookies"] != "":
              self.browser.delete_all_cookies()  # Remove all existing cookies
              for cookie in param["cookies"].split('\n'):
                  cookie = cookie.strip()
                  # A blank line or a line without '=' cannot be unpacked
                  # into (name, value); skip it rather than fail the load
                  if "=" not in cookie:
                      continue
                  name, value = cookie.split('=', 1)
                  name = name.strip()
                  if name == "":
                      continue
                  self.browser.add_cookie({'name': name, 'value': value})
          self.print_and_log('加载页面|Loading page: ' + url)
      except TimeoutException:
          self.print_and_log(
              'Time out after set seconds when loading page: ' + url)
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
      except Exception as e:
          self.print_and_log("Failed to load page: " + url)
          self.print_and_log(e)

      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          try:
              # Stop the pending load, then retry once
              self.browser.execute_script('window.stop()')
              self.history["index"] = self.browser.execute_script(
                  "return history.length")
          except Exception:
              self.history["index"] = 0
      except Exception:
          self.print_and_log("History Length Error")
          self.history["index"] = 0
      self.scrollDown(param)  # Scroll the page down as configured
  1571. # 键盘输入操作
  def inputInfo(self, param, loopValue):
      """Clear a text box located by XPath and type a value into it.

      The text is ``loopValue`` when ``param["useLoop"]`` is set, otherwise
      ``param["value"]``. Placeholders are substituted through
      ``replace_field_values`` and any ``<enter>`` marker (case-insensitive)
      is removed from the typed text; if the raw value contained the marker,
      an ENTER key press is sent afterwards. When ``param["index"]`` is
      non-zero, the text is split on ``~`` and the 1-based ``index``-th piece
      is typed.

      Args:
          param: Operation settings. Reads ``xpath``, ``iframe``,
              ``beforeJS``/``beforeJSWaitTime``, ``afterJS``/
              ``afterJSWaitTime``, ``useLoop``, ``value`` and ``index``
              (treated as 0 when missing or not an integer).
          loopValue: Text supplied by the enclosing loop.
      """
      time.sleep(0.1)  # Wait 0.1 seconds before typing
      xpath = ""  # Bound up front so the error handler can always report it
      try:
          xpath = replace_field_values(
              param["xpath"], self.outputParameters, self)
          textbox = self.browser.find_element(
              By.XPATH, xpath, iframe=param["iframe"])
          self.execute_code(
              2, param["beforeJS"], param["beforeJSWaitTime"], textbox,
              iframe=param["iframe"])  # Run the pre-input JavaScript
          # Clear the box: HOME, then SHIFT+END to select, then DELETE
          textbox.send_keys(Keys.HOME)
          textbox.send_keys(Keys.SHIFT, Keys.END)
          textbox.send_keys(Keys.DELETE)

          if param["useLoop"]:
              value = loopValue
          else:
              value = param["value"]
          # Replace Field[""] placeholders with values from outputParameters
          try:
              replaced_text = replace_field_values(
                  value, self.outputParameters, self)
              replaced_text = re.sub(
                  '<enter>', '', replaced_text, flags=re.IGNORECASE)
          except Exception:
              replaced_text = value

          # A missing or malformed index must not be reported as a missing
          # element, so parse it defensively
          try:
              index = int(param["index"])
          except Exception:
              index = 0
          if index != 0:
              try:
                  replaced_text = replaced_text.split("~")[index - 1]
              except Exception:
                  self.print_and_log("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
                  self.print_and_log(
                      "Failed to get value, maybe because the index is out of range, will use the entire text value")

          textbox.send_keys(replaced_text)
          if "<enter>" in value.lower():
              textbox.send_keys(Keys.ENTER)
          self.recordLog("输入文字|Input text: " +
                         replaced_text + " to " + xpath)
          self.execute_code(
              2, param["afterJS"], param["afterJSWaitTime"], textbox,
              iframe=param["iframe"])  # Run the post-input JavaScript
      except Exception as e:
          self.print_and_log("Cannot find input box element:" +
                             xpath + ", please try to set the wait time before executing this operation")
          self.print_and_log("找不到输入框元素:" + xpath + ",请尝试在执行此操作前设置等待时间")
          self.print_and_log(e)  # Surface the underlying cause as well
  1623. # 点击元素操作
  def clickElement(self, param, loopElement=None, clickPath="", index=0):
      """Click an element (or a page coordinate) and update the history state.

      The target XPath is ``//body`` when ``param["xpath"]`` contains
      ``point(x,y)`` (the click is then made at that coordinate through
      ``document.elementFromPoint``); inside a loop (``param["useLoop"]``) it
      is ``clickPath`` or ``(clickPath)[index + 1] + xpath``; otherwise it is
      ``param["xpath"]`` itself.

      Args:
          param: Operation settings. Reads ``maxWaitTime``, ``xpath``,
              ``useLoop``, ``iframe``, ``beforeJS``/``beforeJSWaitTime``,
              ``afterJS``/``afterJSWaitTime``, ``clickWay`` (0 Selenium
              click, 1 JavaScript click, 2 double click), ``newTab``,
              ``alertHandleType`` (1 accept, 2 dismiss; treated as 0 when
              missing or not an integer), ``wait`` and ``waitType`` (0 fixed,
              1 random between 0.5x and 1.5x); it is also handed to
              ``self.scrollDown``.
          loopElement: Not used by this implementation.
          clickPath: XPath of the enclosing loop's elements.
          index: Zero-based position of the current loop element.
      """
      try:
          maxWaitTime = int(param["maxWaitTime"])
      except Exception:
          maxWaitTime = 10
      self.browser.set_page_load_timeout(maxWaitTime)  # Page-load timeout
      self.browser.set_script_timeout(maxWaitTime)

      # Locate the element and run the pre-click JavaScript on it
      path = ""
      xpath = ""
      element = None
      try:
          clickPath = replace_field_values(
              clickPath, self.outputParameters, self)
          xpath = replace_field_values(
              param["xpath"], self.outputParameters, self)
          if xpath.find("point(") >= 0:  # point() means a coordinate click
              index = 0
              path = "//body"
          elif param["useLoop"]:  # In a loop, clickPath is the real XPath
              if xpath == "":
                  path = clickPath
              else:
                  path = "(" + clickPath + ")" + \
                         "[" + str(index + 1) + "]" + \
                         xpath
                  # Relative click inside the loop element: once the element
                  # is addressed by the full path, index is reset to 0
                  index = 0
          else:
              index = 0
              path = xpath  # Otherwise use the XPath defined on the operation
          elements = self.browser.find_elements(
              By.XPATH, path, iframe=param["iframe"])
          element = elements[index]
          if param["beforeJS"] != "":
              self.execute_code(2, param["beforeJS"],
                                param["beforeJSWaitTime"], element, iframe=param["iframe"])
      except Exception:
          self.print_and_log("Cannot find element:" +
                             path + ", please try to set the wait time before executing this operation")
          self.print_and_log("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")

      # JavaScript fallback: walk the XPath result to the index-th node and
      # click it
      js_click = 'var result = document.evaluate(`' + path + \
                 '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'

      tempHandleNum = len(self.browser.window_handles)  # Window count before the click
      try:
          click_way = int(param["clickWay"])
      except Exception:
          click_way = 0
      try:
          newTab = int(param["newTab"])
      except Exception:
          newTab = 0

      try:
          if xpath.find("point(") >= 0:  # Coordinate click
              point = xpath.split("point(")[1].split(")")[0].split(",")
              x = int(point[0])
              y = int(point[1])
              script = "document.elementFromPoint(" + str(x) + "," + str(y) + ").click();"
              self.browser.execute_script(script)
          elif click_way == 0:  # Selenium click
              if element is None:
                  # ActionChains.click(None) would click at the current
                  # pointer position, so fail explicitly instead
                  raise NoSuchElementException("Element not found: " + path)
              try:
                  actions = ActionChains(self.browser)
                  if newTab == 1:  # Open in a new tab
                      if sys.platform == "darwin":  # Mac: Command + Click
                          actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
                      else:
                          # Ctrl + Click
                          actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
                  else:
                      actions.click(element).perform()
              except Exception:
                  self.browser.execute_script("arguments[0].scrollIntoView();", element)
                  try:
                      actions = ActionChains(self.browser)
                      actions.click(element).perform()
                  except Exception:
                      self.print_and_log(f"Selenium点击元素{path}失败,将尝试使用JavaScript点击")
                      self.print_and_log(f"Failed to click element {path} with Selenium, will try to click with JavaScript")
                      self.browser.execute_script(js_click, str(index))
          elif click_way == 1:  # JavaScript click
              self.browser.execute_script(js_click, str(index))
          elif click_way == 2:  # Double click
              if element is None:
                  raise NoSuchElementException("Element not found: " + path)
              try:
                  actions = ActionChains(self.browser)
                  actions.double_click(element).perform()
              except Exception:
                  self.browser.execute_script("arguments[0].scrollIntoView();", element)
                  try:
                      actions = ActionChains(self.browser)
                      actions.double_click(element).perform()
                  except Exception:
                      self.print_and_log(f"Selenium双击元素{path}失败,将尝试使用JavaScript双击")
                      self.print_and_log(f"Failed to double click element {path} with Selenium, will try to double click with JavaScript")
                      # NOTE(review): this fallback issues a single click()
                      self.browser.execute_script(js_click, str(index))
          self.recordLog("点击元素|Click element: " + path)
      except TimeoutException:
          self.print_and_log(
              'Time out after set seconds when loading clicked page')
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
      except Exception as e:
          self.print_and_log(
              "点击元素失败:" + path, ",请尝试将点击类型改为JavaScript点击后重试。")
          self.print_and_log("Failed to click element:" + path,
                             ", please try to change the click type to JavaScript Click.")
          self.print_and_log(e)

      # Alert handling; the setting is parsed defensively like clickWay/newTab
      try:
          alertHandleType = int(param["alertHandleType"])
      except Exception:
          alertHandleType = 0
      if alertHandleType > 0:
          try:
              time.sleep(1.5)
              alert = self.browser.switch_to.alert
              if alertHandleType == 1:
                  alert.accept()
                  self.print_and_log("已点击确认|Clicked OK")
              elif alertHandleType == 2:
                  alert.dismiss()
                  self.print_and_log("已点击取消|Clicked Cancel")
          except Exception:
              self.print_and_log("找不到弹窗|Cannot find alert")

      # Run the post-click JavaScript on the (re-located) element
      try:
          if param["afterJS"] != "":
              element = self.browser.find_element(
                  By.XPATH, path, iframe=param["iframe"])
              self.execute_code(2, param["afterJS"],
                                param["afterJSWaitTime"], element, iframe=param["iframe"])
      except Exception:
          self.print_and_log("Cannot find element:" + path)
          self.print_and_log("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")

      waitTime = float(param["wait"]) + 0.01  # Wait after the click
      try:
          waitType = int(param["waitType"])
      except Exception:
          waitType = 0
      if waitType == 0:  # Fixed wait
          time.sleep(waitTime)
      elif waitType == 1:  # Random wait
          time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))

      if tempHandleNum != len(self.browser.window_handles):
          # The click opened a new tab: switch to it
          self.browser.switch_to.window(self.browser.window_handles[-1])
          self.history["handle"] = self.browser.current_window_handle
      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          try:
              # Stop the pending load, then retry once; the retry is guarded
              # so a second timeout cannot escape this method
              self.browser.execute_script('window.stop()')
              self.history["index"] = self.browser.execute_script(
                  "return history.length")
          except Exception:
              self.history["index"] = 0
      except Exception:
          self.print_and_log("History Length Error")
          self.history["index"] = 0
      self.scrollDown(param)  # Scroll down as configured
  def get_content(self, p, element):
      """Extract one field's value from ``element`` according to ``p``.

      ``p["contentType"]`` selects what is read:
      0 text (1 direct text nodes only) - or, for ``p["nodeType"]`` 2/3/4,
      the ``href``/``value``/``src`` attribute; 2 innerHTML; 3 outerHTML;
      4 background-image URL; 5 current URL; 6 page title; 7 element
      screenshot saved to disk (returns ""); 8 OCR of the element;
      9 JavaScript result; 10/11 selected option value/text; 12 system
      command result; 13 eval result; 14 attribute named by ``p["JS"]``;
      15 the constant ``p["JS"]``.

      Args:
          p: Field settings (``contentType``, ``nodeType`` and, depending on
              the type, ``splitLine``, ``downloadPic``, ``JS``,
              ``JSWaitTime``, ``iframe``).
          element: Element to read from.

      Returns:
          The extracted content; ``""`` when the result is ``None``.
      """
      content = ""
      content_type = p["contentType"]
      if content_type == 0 or content_type == 1:
          # Special node types read an attribute instead of text:
          # 2 -> href, 3 -> value, 4 -> src (image)
          attribute = {2: "href", 3: "value", 4: "src"}.get(p["nodeType"])
          if attribute is not None:
              value = element.get_attribute(attribute)
              content = "" if value is None else value
              if attribute == "src":
                  try:
                      downloadPic = p["downloadPic"]
                  except Exception:
                      downloadPic = 0
                  if downloadPic == 1:
                      download_image(self, content, "Data/Task_" +
                                     str(self.id) + "/" + self.saveName + "/images", element)
          elif content_type == 0:  # Ordinary node: all text
              if p["splitLine"] == 1:
                  text = extract_text_from_html(element.get_attribute('outerHTML'))
                  content = split_text_by_lines(text)
              else:
                  content = element.text
          else:  # Only the element's own text nodes, excluding children
              command = ('var arr = [];'
                         'var content = arguments[0];'
                         'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
                         'if(content.childNodes[i].nodeType === 3){ '
                         'arr.push(content.childNodes[i].nodeValue);'
                         '}'
                         '}'
                         'var str = arr.join(" "); '
                         'return str;')
              content = self.browser.execute_script(command, element)
              # Drop newlines, then collapse whitespace runs; str.replace
              # treats "\\s+" literally, so a regex substitution is needed
              content = re.sub(r"\s+", " ", content.replace("\n", ""))
      elif content_type == 2:
          content = element.get_attribute('innerHTML')
      elif content_type == 3:
          content = element.get_attribute('outerHTML')
      elif content_type == 4:
          # Background image address, with the url("...") wrapper removed
          bg_url = element.value_of_css_property('background-image')
          content = bg_url.replace('url("', '').replace('")', '')
      elif content_type == 5:
          content = self.browser.current_url
      elif content_type == 6:
          content = self.browser.title
      elif content_type == 7:
          # The window size is read and re-applied unchanged around the
          # screenshot
          size = self.browser.get_window_size()
          width = size["width"]
          height = size["height"]
          self.browser.set_window_size(width, height)
          element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
                             "/" + str(time.time()) + ".png")
          self.browser.set_window_size(width, height)
      elif content_type == 8:
          try:
              size = self.browser.get_window_size()
              width = size["width"]
              height = size["height"]
              screenshot = element.screenshot_as_png
              # Convert the screenshot to greyscale with Pillow
              image = Image.open(io.BytesIO(screenshot)).convert('L')
              # Encode in memory so that no temporary file is left behind
              # when recognition fails
              buffer = io.BytesIO()
              image.save(buffer, format="PNG")
              ocr = DdddOcr(show_ad=False)
              content = ocr.classification(buffer.getvalue())
              self.browser.set_window_size(width, height)
          except Exception as e:
              content = "OCR Error"
              self.print_and_log(e)
      elif content_type == 9:
          content = self.execute_code(
              2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
      elif content_type == 12:  # System command return value
          content = self.execute_code(1, p["JS"], p["JSWaitTime"])
      elif content_type == 13:  # eval return value
          content = self.execute_code(6, p["JS"], p["JSWaitTime"])
      elif content_type == 10:  # Value of the selected drop-down option
          try:
              content = Select(element).first_selected_option.get_attribute(
                  "value")
          except Exception:
              content = ""
      elif content_type == 11:  # Text of the selected drop-down option
          try:
              content = Select(element).first_selected_option.text
          except Exception:
              content = ""
      elif content_type == 14:  # Element attribute value
          try:
              content = element.get_attribute(p["JS"])
          except Exception:
              content = ""
      elif content_type == 15:  # Constant value
          content = p["JS"]
      if content is None:
          content = ""
      return content
  def clearOutputParameters(self):
      """Blank every output parameter in place and record that in the log."""
      blanks = dict.fromkeys(self.outputParameters, "")
      self.outputParameters.update(blanks)
      self.recordLog("清空输出参数|Clear output parameters")
  1984. # 提取数据操作
  def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
      """Extract every field of a data-extraction operation into outputParameters.

      Fields flagged ``optimizable`` are evaluated with lxml against the
      page source (or the loop element's outerHTML); all other fields -
      including optimizable ones whose ``iframe`` setting differs from
      ``self.browser.iframe_env`` - are read through Selenium and
      ``self.get_content``. When ``param["recordASField"] > 0`` and
      ``param["newLine"]`` is truthy, a new output line is appended to
      ``self.OUTPUT``.

      Args:
          param: Operation settings. Reads ``clear``, ``params`` (list of
              field dicts), ``recordASField`` and ``newLine``.
          loopElement: Current loop element, or ``""`` outside a loop.
          isInLoop: Not used by this implementation.
          parentPath: XPath of the enclosing loop's elements.
          index: Zero-based position of the current loop element.

      If the fast path raises for a field, that field receives its default
      value rather than whatever ``content`` held from an earlier field.
      """
      parentPath = replace_field_values(
          parentPath, self.outputParameters, self)
      if param["clear"] == 1:
          self.clearOutputParameters()
      try:
          pageHTML = etree.HTML(self.browser.page_source)
      except Exception:
          pageHTML = etree.HTML("")
      if loopElement != "":  # The loop element is only needed inside a loop
          try:
              loopElementOuterHTML = loopElement.get_attribute('outerHTML')
          except Exception:
              # When links are clicked in a loop without opening a new tab,
              # loopElement is lost and has to be located again
              try:
                  elements = self.browser.find_elements(
                      By.XPATH, parentPath, iframe=param["params"][0]["iframe"])
                  loopElement = elements[index]
                  loopElementOuterHTML = loopElement.get_attribute(
                      'outerHTML')
              except Exception:
                  loopElementOuterHTML = ""
      else:
          loopElementOuterHTML = ""
      loopElementHTML = etree.HTML(loopElementOuterHTML)

      # Fast path: evaluate optimizable fields with lxml
      for p in param["params"]:
          if p["optimizable"]:
              relativeXPath = p.get("relativeXPath", "")
              try:
                  relativeXPath = replace_field_values(
                      p["relativeXPath"], self.outputParameters, self)
                  # Fast extraction only works when the browser's iframe
                  # environment matches the field's; otherwise defer the
                  # field to the Selenium pass below
                  if self.browser.iframe_env != p["iframe"]:
                      p["optimizable"] = False
                      continue
                  # Nothing is appended when the XPath already selects
                  # text() or @href
                  content_type = ""
                  if relativeXPath.find("/@href") >= 0 or relativeXPath.find("/text()") >= 0 or relativeXPath.find(
                          "::text()") >= 0:
                      content_type = ""
                  elif p["nodeType"] == 2:
                      content_type = "//@href"
                  elif p["nodeType"] == 4:
                      content_type = "//@src"
                  elif p["contentType"] == 1:
                      content_type = "/text()"
                  elif p["contentType"] == 0:
                      content_type = "//text()"
                  xpath = relativeXPath + content_type
                  if p["relative"]:
                      # A "//" descendant search is evaluated on the full page
                      if relativeXPath.find("//") >= 0:
                          if xpath.startswith("/"):
                              full_path = "(" + parentPath + ")" + \
                                          "[" + str(index + 1) + "]" + \
                                          relativeXPath + content_type
                          else:  # Forms such as id() need no parentPath prefix
                              full_path = xpath
                          try:
                              content = pageHTML.xpath(full_path)
                          except Exception:
                              content = []
                      # Forms such as id() need no /html/body prefix
                      elif not relativeXPath.startswith("/"):
                          try:
                              content = loopElementHTML.xpath(xpath)
                          except Exception:
                              content = []
                      else:
                          content = loopElementHTML.xpath(
                              "/html/body/" + loopElementHTML[0][0].tag + xpath)
                  else:
                      # Forms such as id() or (//div)[1] need no /html/body prefix
                      if xpath.find("/body") < 0 and xpath.startswith("/"):
                          xpath = "/html/body" + xpath
                      content = pageHTML.xpath(xpath)
                  if len(content) > 0:
                      # Join all non-blank text pieces, trimmed
                      content = ' '.join(result.strip()
                                         for result in content if result.strip())
                      if p["nodeType"] == 2 or p["nodeType"] == 4:
                          base_url = self.browser.current_url
                          # Resolve a relative link against the current URL
                          content = urljoin(base_url, content)
                  else:
                      content = p["default"]
                      if not self.dataNotFoundKeys[p["name"]]:
                          self.print_and_log(
                              'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                                  relativeXPath, p["name"]))
                          self.print_and_log(
                              "提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
                                  p["name"], relativeXPath))
                          self.dataNotFoundKeys[p["name"]] = True
              except Exception:
                  # Use the default explicitly: content may still hold the
                  # previous field's value or a raw lxml result list
                  content = p.get("default", "")
                  if not self.dataNotFoundKeys[p["name"]]:
                      self.print_and_log(
                          'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                              relativeXPath, p["name"]))
                      self.print_and_log(
                          "提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % (
                              p["name"], relativeXPath))
                      self.dataNotFoundKeys[p["name"]] = True
              self.outputParameters[p["name"]] = content

      # Fields that cannot be optimised are read through Selenium
      for p in param["params"]:
          if not p["optimizable"]:
              content = ""
              relativeXPath = replace_field_values(
                  p["relativeXPath"], self.outputParameters, self)
              # Page title and URL need no element lookup
              if not (p["contentType"] == 5 or p["contentType"] == 6):
                  try:
                      if p["relative"]:  # Relative XPath
                          if relativeXPath == "":
                              # The relative XPath is the loop element itself
                              element = loopElement
                          else:
                              # A "//" descendant search uses the full path
                              if relativeXPath.find("//") >= 0:
                                  full_path = "(" + parentPath + ")" + \
                                              "[" + str(index + 1) + "]" + \
                                              relativeXPath
                                  element = self.browser.find_element(
                                      By.XPATH, full_path, iframe=p["iframe"])
                              else:
                                  element = loopElement.find_element(By.XPATH,
                                                                     relativeXPath[1:])
                      else:
                          element = self.browser.find_element(
                              By.XPATH, relativeXPath, iframe=p["iframe"])
                  except (NoSuchElementException, InvalidSelectorException,
                          StaleElementReferenceException):
                      # Element not found: fall back to the default value
                      try:
                          content = p["default"]
                      except Exception:
                          content = ""
                      self.outputParameters[p["name"]] = content
                      try:
                          if not self.dataNotFoundKeys[p["name"]]:
                              self.print_and_log(
                                  'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                                      relativeXPath, p["name"]))
                              self.print_and_log(
                                  "提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
                                      p["name"], relativeXPath))
                              self.dataNotFoundKeys[p["name"]] = True
                      except Exception:
                          pass
                      continue
                  except TimeoutException:
                      # Stop the pending load and look the element up again
                      self.print_and_log(
                          'Time out after set seconds when getting data')
                      try:
                          self.browser.execute_script('window.stop()')
                      except Exception:
                          pass
                      if p["relative"]:
                          if relativeXPath == "":
                              element = loopElement
                          else:
                              element = loopElement.find_element(By.XPATH,
                                                                 relativeXPath[1:])
                      else:
                          element = self.browser.find_element(
                              By.XPATH, relativeXPath, iframe=p["iframe"])
              else:
                  element = self.browser.find_element(
                      By.XPATH, "//body", iframe=p["iframe"])
              try:
                  self.execute_code(
                      2, p["beforeJS"], p["beforeJSWaitTime"], element,
                      iframe=p["iframe"])  # Run the pre-extraction JavaScript
                  content = self.get_content(p, element)
              except StaleElementReferenceException:
                  # The element went stale: wait, locate it again, retry once
                  self.recordLog(
                      'StaleElementReferenceException: ' + relativeXPath)
                  time.sleep(3)
                  try:
                      if p["relative"]:
                          if relativeXPath == "":
                              element = loopElement
                              self.recordLog(
                                  'StaleElementReferenceException: loopElement')
                          else:
                              element = loopElement.find_element(By.XPATH,
                                                                 relativeXPath[1:])
                              self.recordLog(
                                  'StaleElementReferenceException: loopElement+relativeXPath')
                      else:
                          element = self.browser.find_element(
                              By.XPATH, relativeXPath, iframe=p["iframe"])
                          self.recordLog(
                              'StaleElementReferenceException: relativeXPath')
                      content = self.get_content(p, element)
                  except StaleElementReferenceException:
                      self.recordLog(
                          'StaleElementReferenceException: ' + relativeXPath)
                      continue  # Skip the field if it goes stale again
              self.outputParameters[p["name"]] = content
              self.execute_code(
                  2, p["afterJS"], p["afterJSWaitTime"], element,
                  iframe=p["iframe"])  # Run the post-extraction JavaScript

      if param["recordASField"] > 0 and param["newLine"]:
          line = new_line(self.outputParameters,
                          self.maxViewLength, self.outputParametersRecord)
          self.OUTPUT.append(line)
  2203. if __name__ == '__main__':
  2204. # 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
  2205. # If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
  2206. config = {
  2207. "ids": [0],
  2208. "saved_file_name": "",
  2209. "user_data": False,
  2210. "config_folder": "",
  2211. "config_file_name": "config.json",
  2212. "read_type": "remote",
  2213. "headless": False,
  2214. "server_address": "http://localhost:8074",
  2215. "keyboard": True, # 是否监听键盘输入
  2216. "pause_key": "p", # 暂停键
  2217. "version": "0.6.2",
  2218. "docker_driver": "",
  2219. }
  2220. c = Config(config)
  2221. print(c)
  2222. options = webdriver.ChromeOptions()
  2223. driver_path = "chromedriver.exe"
  2224. print(sys.platform, platform.architecture())
  2225. if not os.path.exists(os.getcwd() + "/Data"):
  2226. os.mkdir(os.getcwd() + "/Data")
  2227. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  2228. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  2229. options.add_extension(
  2230. "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  2231. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  2232. print(driver_path)
  2233. if c.config_folder == "":
  2234. c.config_folder = os.path.expanduser(
  2235. "~/Library/Application Support/EasySpider/")
  2236. elif os.path.exists(os.getcwd() + "/EasySpider/resources"): # 打包后的路径
  2237. print("Finding chromedriver in EasySpider",
  2238. os.getcwd() + "/EasySpider")
  2239. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  2240. options.binary_location = os.path.join(
  2241. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  2242. driver_path = os.path.join(
  2243. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  2244. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  2245. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  2246. options.binary_location = os.path.join(
  2247. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  2248. driver_path = os.path.join(
  2249. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  2250. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  2251. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  2252. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  2253. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  2254. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  2255. else:
  2256. print("Unsupported platform")
  2257. sys.exit()
  2258. print("Chrome location:", options.binary_location)
  2259. print("Chromedriver location:", driver_path)
  2260. elif os.path.exists(os.getcwd() + "/../ElectronJS"):
  2261. # 软件dev用
  2262. print("Finding chromedriver in EasySpider",
  2263. os.getcwd() + "/ElectronJS")
  2264. options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  2265. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  2266. options.add_extension("../ElectronJS/XPathHelper.crx")
  2267. else:
  2268. options.binary_location = "./chrome.exe" # 指定chrome位置
  2269. driver_path = "./chromedriver.exe"
  2270. options.add_extension("XPathHelper.crx")
  2271. options.add_experimental_option(
  2272. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  2273. # 总结:
  2274. # 0. 带Cookie需要用userdatadir
  2275. # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
  2276. # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
  2277. # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
  2278. # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
  2279. try:
  2280. with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
  2281. config = json.load(f)
  2282. print("Config file path: " +
  2283. c.config_folder + c.config_file_name)
  2284. absolute_user_data_folder = config["absolute_user_data_folder"]
  2285. except:
  2286. pass
  2287. options.add_argument(
  2288. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  2289. # 阻止http -> https的重定向
  2290. options.add_argument("--disable-features=CrossSiteDocumentBlockingIfIsolating,CrossSiteDocumentBlockingAlways,IsolateOrigins,site-per-process")
  2291. options.add_argument("--disable-web-security") # 禁用同源策略
  2292. options.add_argument('-ignore-certificate-errors')
  2293. options.add_argument('-ignore -ssl-errors')
  2294. if c.headless:
  2295. print("Headless mode")
  2296. print("无头模式")
  2297. options.add_argument("--headless")
  2298. tmp_options = []
  2299. for id in c.ids:
  2300. tmp_options.append({"options": copy.deepcopy(options), "tmp_user_data_folder": ""})
  2301. if c.user_data:
  2302. tmp_user_folder_parent = os.path.join(os.getcwd(), "TempUserDataFolder")
  2303. if not os.path.exists(tmp_user_folder_parent):
  2304. os.mkdir(tmp_user_folder_parent)
  2305. characters = string.ascii_letters + string.digits
  2306. for i in range(len(c.ids)):
  2307. id = c.ids[i]
  2308. # 从字符集中随机选择字符构成字符串
  2309. random_string = ''.join(random.choice(characters) for i in range(10))
  2310. tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
  2311. tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
  2312. if os.path.exists(tmp_user_data_folder):
  2313. try:
  2314. shutil.rmtree(tmp_user_data_folder)
  2315. except:
  2316. pass
  2317. print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
  2318. print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
  2319. if os.path.exists(absolute_user_data_folder):
  2320. try:
  2321. shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
  2322. print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
  2323. print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
  2324. except:
  2325. tmp_user_data_folder = absolute_user_data_folder
  2326. print("Copy user data folder failed, use the original folder.")
  2327. print("复制用户信息目录失败,使用原始目录。")
  2328. else:
  2329. tmp_user_data_folder = absolute_user_data_folder
  2330. print("Cannot find user data folder, create a new folder.")
  2331. print("未找到用户信息目录,创建新目录。")
  2332. options = tmp_options[i]["options"]
  2333. options.add_argument(
  2334. f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
  2335. options.add_argument("--profile-directory=Default")
  # User hint, printed first in Chinese and then in English: a
  # "Chrome failed to start: exited abnormally" WebDriverException means an
  # earlier Chrome instance was not closed properly and must be closed before rerunning.
  2336. print(
  2337. "如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally,说明有之前运行的Chrome实例没有正常关闭,请关闭之前打开的所有Chrome实例后再运行程序即可。")
  2338. print(
  2339. "If you get an error Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally, it means that there is a Chrome instance that was not closed properly before, please close all Chrome instances that were opened before running the program.")
  2340. threads = []
  2341. for i in range(len(c.ids)):
  2342. id = c.ids[i]
  2343. options = tmp_options[i]["options"]
  2344. print("id: ", id)
  2345. if c.read_type == "remote":
  2346. print("remote")
  2347. content = requests.get(
  2348. c.server_address + "/queryExecutionInstance?id=" + str(id))
  2349. service = json.loads(content.text) # 加载服务信息
  2350. else:
  2351. print("local")
  2352. local_folder = os.path.join(os.getcwd(), "execution_instances")
  2353. if sys.platform == "darwin":
  2354. user_folder = os.path.expanduser(
  2355. "~/Library/Application Support/EasySpider/")
  2356. local_folder = os.path.join(user_folder, "execution_instances")
  2357. file_path = os.path.join(local_folder, str(id) + ".json")
  2358. with open(file_path, 'r', encoding='utf-8') as f:
  2359. content = f.read()
  2360. service = json.loads(content) # 加载服务信息
  2361. try:
  2362. print("Task Name:", service["name"])
  2363. print("任务名称:", service["name"])
  2364. except:
  2365. print(f"Cannot find task with id: {str(id)}, please check whether {str(id)}.json exists in the 'execution_instances' folder.")
  2366. print(f"未找到id为{str(id)}的任务,请检查'execution_instances'文件夹中是否存在{str(id)}.json文件。")
  2367. continue
  2368. try:
  2369. cloudflare = service["cloudflare"]
  2370. except:
  2371. cloudflare = 0
  2372. if cloudflare == 0:
  2373. options.add_argument('log-level=3') # 隐藏日志
  2374. path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id), "files")
  2375. print("文件下载路径|File Download path:", path)
  2376. options.add_experimental_option("prefs", {
  2377. # 设置文件下载路径
  2378. "download.default_directory": path,
  2379. "download.prompt_for_download": False, # 禁止下载提示框
  2380. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  2381. "download.directory_upgrade": True,
  2382. "download.extensions_to_open": "applications/pdf",
  2383. "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
  2384. "safebrowsing_for_trusted_sources_enabled": False,
  2385. "safebrowsing.enabled": False,
  2386. 'safebrowsing.disable_download_protection': True,
  2387. 'profile.default_content_settings.popups': 0,
  2388. })
  2389. try:
  2390. if service["environment"] == 1:
  2391. options.add_experimental_option(
  2392. 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  2393. except:
  2394. pass
  2395. try:
  2396. browser = service["browser"]
  2397. except:
  2398. browser = "chrome"
  2399. if browser == "chrome":
  2400. if c.docker_driver == "":
  2401. print("Using local driver")
  2402. selenium_service = Service(executable_path=driver_path)
  2403. browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
  2404. else:
  2405. print("Using remote driver")
  2406. # Use docker driver, default address is http://localhost:4444/wd/hub
  2407. # Headless mode
  2408. options.add_argument("--headless")
  2409. print("Headless mode")
  2410. browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
  2411. elif browser == "edge":
  2412. from selenium.webdriver.edge.service import Service as EdgeService
  2413. from selenium.webdriver.edge.options import Options as EdgeOptions
  2414. from myChrome import MyEdge
  2415. selenium_service = EdgeService(executable_path="msedgedriver.exe")
  2416. options = EdgeOptions()
  2417. options.use_chromium = True
  2418. options.add_argument("--ie-mode")
  2419. options.add_argument("ie.edgepath=msedge.exe")
  2420. browser_t = MyEdge(service=selenium_service, options=options)
  2421. elif cloudflare == 1:
  2422. if sys.platform == "win32":
  2423. options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
  2424. browser_t = MyUCChrome(
  2425. options=options, driver_executable_path=driver_path)
  2426. links = list(filter(isnotnull, service["links"].split("\n")))
  2427. # open page in new tab
  2428. browser_t.execute_script(
  2429. 'window.open("' + links[0] + '","_blank");')
  2430. time.sleep(5) # wait until page has loaded
  2431. browser_t.switch_to.window(
  2432. browser_t.window_handles[1]) # switch to new tab
  2433. # browser_t = uc.Chrome()
  2434. else:
  2435. print("Cloudflare模式只支持Windows x64平台。")
  2436. print(
  2437. "Cloudflare Mode only support on Windows x64 platform.")
  2438. sys.exit()
  2439. event = Event()
  2440. event.set()
  2441. thread = BrowserThread(browser_t, id, service,
  2442. c.version, event, c.saved_file_name, config=config, option=tmp_options[i])
  2443. print("Thread with task id: ", id, " is created")
  2444. threads.append(thread)
  2445. thread.start()
  # Pause support and shutdown: choose the pause key, print usage instructions,
  # optionally run a pynput keyboard listener, then wait for all task threads.
  # NOTE(review): `service` and `event` are whatever the task-launch code bound
  # most recently - confirm this is intended when several ids are run.
  2446. # Set the pause operation
  2447. # if sys.platform != "linux":
  2448. # time.sleep(3)
  2449. # Thread(target=check_pause, args=("p", event)).start()
  2450. # else:
  2451. time.sleep(3)
  # A pause key given on the command line wins unless it is the default "p";
  # in that case the task's own "pauseKey" is used when present, otherwise "p".
  2452. if c.pause_key == "p":
  2453. try:
  2454. pause_key = service["pauseKey"]
  2455. except:
  2456. pause_key = "p"
  2457. else:
  2458. pause_key = c.pause_key
  # State dict handed to the on_press/on_release callback factories imported from utils.
  2459. press_time = {"duration": 0, "is_pressed": False, "pause_key": pause_key}
  2460. print("\n\n----------------------------------")
  2461. print(
  2462. "正在运行任务,长按键盘" + pause_key + "键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按" + pause_key + "键。")
  2463. print(
  2464. "Running task, long press '" + pause_key + "' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press '" + pause_key + "' again.")
  2465. print("----------------------------------\n\n")
  2466. # if cloudflare:
  2467. # print(<Chinese version of the Cloudflare instability notice on the next line>)
  2468. # print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
  2469. # Use a listener to monitor keyboard input
  # pynput is imported lazily. Any failure in this section (import error or
  # listener error) is swallowed by the bare except, so the program continues
  # without pause support. listener.join() waits for the listener to end.
  2470. try:
  2471. from pynput.keyboard import Key, Listener
  2472. if c.keyboard:
  2473. with Listener(on_press=on_press_creator(press_time, event),
  2474. on_release=on_release_creator(event, press_time)) as listener:
  2475. listener.join()
  2476. except:
  2477. pass
  2478. # print(<Chinese version of the message on the next line>)
  2479. # print("Your operating system does not support the pause function.")
  # Wait for every started task thread, printing an empty line before each join.
  2480. for thread in threads:
  2481. print()
  2482. thread.join()