easyspider_executestage.py 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525
  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. from datetime import datetime
  4. import io # 遇到错误退出时应执行的代码
  5. import json
  6. # from lib2to3.pgen2 import driver
  7. import re
  8. # import shutil
  9. import subprocess
  10. import sys
  11. # from urllib import parse
  12. # import base64
  13. # import hashlib
  14. import time
  15. # import keyboard
  16. import requests
  17. from lxml import etree
  18. from selenium.webdriver.chrome.options import Options
  19. from selenium.webdriver.common.keys import Keys
  20. from selenium.webdriver.common.action_chains import ActionChains
  21. from selenium import webdriver
  22. from selenium.webdriver.support.ui import WebDriverWait
  23. from selenium.webdriver.support import expected_conditions as EC
  24. from selenium.webdriver.common.by import By
  25. from selenium.common.exceptions import NoSuchElementException
  26. from selenium.common.exceptions import TimeoutException
  27. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  28. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  29. from selenium.webdriver.support.ui import Select
  30. from selenium.webdriver import ActionChains
  31. from selenium.webdriver.common.by import By
  32. import undetected_chromedriver as uc
  33. import random
  34. # import pandas as pd
  35. from openpyxl import load_workbook, Workbook
  36. # import numpy
  37. import csv
  38. import os
  39. from commandline_config import Config
  40. import pytesseract
  41. from PIL import Image
  42. # import uuid
  43. from threading import Thread, Event
  44. from myChrome import MyChrome, MyUCChrome
  45. from utils import check_pause, download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, write_to_csv, write_to_excel
  # Module-wide Chrome capability map. The "none" page-load strategy is
  # requested here; see the WebDriver capability documentation for its exact
  # semantics.
  desired_capabilities = DesiredCapabilities.CHROME
  desired_capabilities.update({"pageLoadStrategy": "none"})
  48. class BrowserThread(Thread):
  def __init__(self, browser_t, id, service, version, event, saveName, config):
      """Set up one task-execution thread from a parsed task description.

      Args:
          browser_t: Browser driver used for every page operation.
          id: Task ID; output is stored under ``Data/Task_<id>/``.
          service: Task description dict. ``graph``, ``links``,
              ``containJudge`` and ``outputParameters`` are required;
              ``saveName``, ``maxViewLength``, ``outputFormat``,
              ``saveThreshold`` and ``version`` are optional.
          version: Version string of the running EasySpider, compared with
              ``service["version"]`` for tasks older than 0.3.1.
          event: Object whose ``wait()`` is called by ``executeNode``.
          saveName: Save name given on the command line; a non-empty value
              overrides the one stored in ``service``.
          config: Run configuration; ``config["mysql_config_path"]`` is
              read when the output format is ``mysql``.

      Raises:
          SystemExit: If the task file comes from an incompatible version.
      """
      Thread.__init__(self)
      self.browser = browser_t
      self.config = config
      self.id = id
      self.event = event
      self.log = ""
      self.OUTPUT = ""
      self.SAVED = False
      self.BREAK = False

      # Save-name priority: command line > task file > current timestamp.
      timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
      if saveName != "":
          self.saveName = saveName
      else:
          self.saveName = service.get("saveName", timestamp)
      # The literal placeholder "current_time" expands to the start time.
      self.saveName = self.saveName.replace("current_time", timestamp)
      # Use the task ID passed in, not a loop variable of the launcher.
      print("Save Name for task ID", self.id, "is:", self.saveName)
      print("任务ID", self.id, "的保存文件名为:", self.saveName)

      # Per-run folder (used for screenshots); parents are created as needed.
      task_dir = "Data/Task_" + str(self.id)
      os.makedirs(task_dir + "/" + self.saveName, exist_ok=True)

      # NOTE: driver_path is a module-level name defined outside this class.
      # stealth.min.js is expected in the same folder as the chromedriver.
      stealth_path = driver_path[:driver_path.find(
          "chromedriver")] + "stealth.min.js"
      with open(stealth_path, 'r') as f:
          js = f.read()
      print("Loading stealth.min.js")
      # Inject the anti-detection script into every new document.
      self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
          'source': js})
      self.browser.get('about:blank')

      self.procedure = service["graph"]  # execution flow graph
      self.maxViewLength = service.get("maxViewLength", 15)
      self.outputFormat = service.get("outputFormat", "csv")
      self._checkVersion(service, version)
      self.save_threshold = service.get("saveThreshold", 10)
      # One entry per line of service["links"], filtered through isnull().
      self.links = list(
          filter(isnull, service["links"].split("\n")))

      self.OUTPUT = []  # collected rows not yet flushed
      self.writeMode = 1  # 0 = new file, 1 = append, 2 = MySQL
      if self.outputFormat in ("csv", "txt", "xlsx"):
          data_file = task_dir + "/" + self.saveName + '.' + self.outputFormat
          if not os.path.exists(data_file):
              self.OUTPUT.append([])  # header row, filled in below
              self.writeMode = 0
      elif self.outputFormat == "mysql":
          self.mysql = myMySQL(config["mysql_config_path"])
          self.mysql.create_table(self.saveName, service["outputParameters"])
          self.writeMode = 2
      mode_banner = {
          0: ("新建模式", "New Mode"),
          1: ("追加模式", "Append Mode"),
          2: ("MySQL模式", "MySQL Mode"),
      }
      for message in mode_banner[self.writeMode]:
          print(message)

      self.containJudge = service["containJudge"]
      self.outputParameters = {}
      self.outputParametersTypes = []
      self.outputParametersRecord = []  # whether each field is recorded
      self.dataNotFoundKeys = {}  # fields for which no data was found
      self.log = ""
      self.history = {"index": 0, "handle": None}  # history position per tab
      self.SAVED = False
      for para in service["outputParameters"]:
          name = para["name"]
          if name in self.outputParameters:
              continue  # duplicate field names are registered only once
          self.outputParameters[name] = ""
          self.dataNotFoundKeys[name] = False
          self.outputParametersTypes.append(para.get("type", "text"))
          self.outputParametersRecord.append(
              bool(para.get("recordASField", True)))
          # A header is only written when a new file is being created.
          if self.writeMode == 0 and self.outputFormat in ("csv", "txt", "xlsx"):
              self.OUTPUT[0].append(name)
      self.urlId = 0  # index of the link currently being processed
      self.preprocess()

  def _checkVersion(self, service, version):
      """Quit the browser and exit if the task file's version is incompatible."""
      task_version = service.get("version")
      if not isinstance(task_version, str):
          # Task files written by v0.2.0 have no usable version field.
          print("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider v0.2.0 to run this task!")
          self.browser.quit()
          sys.exit()
      # Tasks from 0.3.1 onwards are accepted by every later version.
      if self._versionKey(task_version) >= self._versionKey("0.3.1"):
          return
      # Older tasks must be run by exactly the version that created them.
      if task_version != version:
          print("版本不一致,请使用" +
                task_version + "版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider " +
                task_version + " to run this task!")
          self.browser.quit()
          sys.exit()

  @staticmethod
  def _versionKey(text):
      """Turn "0.3.10" into (0, 3, 10) so versions compare numerically."""
      parts = []
      for piece in text.strip().lstrip("vV").split("."):
          digits = re.match(r"\d+", piece)
          parts.append(int(digits.group()) if digits else 0)
      return tuple(parts)
  165. # 检测如果没有复杂的操作,优化提取数据流程
  166. def preprocess(self):
  167. for node in self.procedure:
  168. try:
  169. iframe = node["parameters"]["iframe"]
  170. except:
  171. node["parameters"]["iframe"] = False
  172. try:
  173. node["parameters"]["xpath"] = lowercase_tags_in_xpath(
  174. node["parameters"]["xpath"])
  175. except:
  176. pass
  177. if node["option"] == 1: # 打开网页操作
  178. try:
  179. cookies = node["parameters"]["cookies"]
  180. except:
  181. node["parameters"]["cookies"] = ""
  182. if node["option"] == 3: # 提取数据操作
  183. paras = node["parameters"]["paras"]
  184. for para in paras:
  185. try:
  186. iframe = para["iframe"]
  187. except:
  188. para["iframe"] = False
  189. try:
  190. para["relativeXPath"] = lowercase_tags_in_xpath(para["relativeXPath"])
  191. except:
  192. pass
  193. if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para["nodeType"] <= 2:
  194. para["optimizable"] = True
  195. else:
  196. para["optimizable"] = False
  197. def run(self):
  198. # 挨个执行程序
  199. for i in range(len(self.links)):
  200. print("正在执行第", i + 1, "/ ", len(self.links), "个链接")
  201. print("Executing link", i + 1, "/ ", len(self.links))
  202. self.executeNode(0)
  203. self.urlId = self.urlId + 1
  204. files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
  205. # 如果目录为空,则删除该目录
  206. if not files:
  207. os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
  208. print("Done!")
  209. print("执行完成!")
  210. self.recordLog("Done!")
  211. self.saveData(exit=True)
  212. if self.outputFormat == "mysql":
  213. self.mysql.close()
  214. def recordLog(self, str=""):
  215. self.log = self.log + str + "\n"
  216. # 控制台打印log函数
  def Log(self, text, text2=""):
      """Console debug print; silent while the local switch is False."""
      verbose = False  # flip to True to echo debug messages
      if not verbose:
          return
      print(text, text2)
  221. # @atexit.register
  222. # def clean(self):
  223. # self.saveData(exit=True)
  224. # self.browser.quit()
  225. # sys.exit(0)
  226. def saveData(self, exit=False):
  227. # 每save_threshold条保存一次
  228. if exit == True or len(self.OUTPUT) >= self.save_threshold:
  229. # 写入日志
  230. with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
  231. file_obj.write(self.log)
  232. file_obj.close()
  233. # 写入数据
  234. if self.outputFormat == "csv" or self.outputFormat == "txt":
  235. file_name = "Data/Task_" + \
  236. str(self.id) + "/" + self.saveName + '.' + self.outputFormat
  237. write_to_csv(file_name, self.OUTPUT, self.outputParametersRecord)
  238. elif self.outputFormat == "xlsx":
  239. file_name = "Data/Task_" + \
  240. str(self.id) + "/" + self.saveName + '.xlsx'
  241. write_to_excel(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
  242. elif self.outputFormat == "mysql":
  243. self.mysql.write_to_mysql(self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
  244. self.OUTPUT = []
  245. self.log = ""
  def scrollDown(self, para, rt=""):
      """Scroll the page as configured, retrying once after a failure.

      Args:
          para: Node parameters. ``scrollType`` is 0 for no scrolling, 1 for
              Page Down, 2 for End; ``scrollCount`` is the number of key
              presses; ``scrollWaitTime`` (optional) is slept before the
              first and after every press; ``iframe`` is forwarded to the
              browser's element lookup.
          rt: Optional object; when it is not ``""`` its ``end()`` is called
              after scrolling.

      If the first attempt raises, page loading is stopped with
      ``window.stop()`` and the scroll sequence is run once more. An error in
      that second attempt propagates to the caller.
      """
      self._scrollPause(para)  # wait before scrolling
      scrollType = int(para["scrollType"])
      try:
          self._scrollPage(para, scrollType)
      except Exception:
          self.Log('Time out after set seconds when scrolling. ')
          self.recordLog('Time out after set seconds when scrolling')
          self.browser.execute_script('window.stop()')
          # The retry uses the same key mapping as the first attempt.
          self._scrollPage(para, scrollType)
      if rt != "":
          rt.end()

  def _scrollPage(self, para, scrollType):
      """Send the configured scroll key to <body> ``scrollCount`` times."""
      if scrollType == 0 or not (para["scrollCount"] > 0):
          return
      key = {1: Keys.PAGE_DOWN, 2: Keys.END}.get(scrollType)
      for _ in range(para["scrollCount"]):
          self.Log("Wait for set second after screen scrolling")
          body = self.browser.find_element(
              By.CSS_SELECTOR, "body", iframe=para["iframe"])
          if key is not None:
              body.send_keys(key)
          self._scrollPause(para)  # wait after each press

  @staticmethod
  def _scrollPause(para):
      """Sleep for ``para["scrollWaitTime"]``; skip if absent or unusable."""
      try:
          time.sleep(para["scrollWaitTime"])
      except (KeyError, TypeError, ValueError, OverflowError):
          pass
  def execute_code(self, codeMode, code, max_wait_time, element=None, iframe=False):
      """Run a user-supplied snippet and return its result as a string.

      Args:
          codeMode: 0 = JavaScript in the page, 1 = system shell command,
              2 = JavaScript that receives ``element`` as its argument.
          code: Snippet text; ``Field["name"]`` is replaced by the current
              value of that output parameter. An empty string returns "".
          max_wait_time: Timeout in seconds; 0 is treated as 999999.
          element: Element handed to the script when ``codeMode`` is 2.
          iframe: Whether the snippet should run inside an iframe context.

      Returns:
          ``str()`` of the script result or the command's stdout; "" when
          the execution failed.
      """
      if code == "":
          return ""
      result = ""
      timeout = 999999 if max_wait_time == 0 else max_wait_time

      # Substitute Field["..."] placeholders; keep the text as is on error.
      try:
          code = re.sub(
              r'Field\["([^"]+)"\]',
              lambda found: self.outputParameters.get(found.group(1), ''),
              code)
      except:
          pass

      if iframe:
          if self.browser.iframe_env == False:
              # Enter the first iframe that can be switched to.
              self.browser.switch_to.default_content()
              frames = self.browser.find_elements(
                  By.CSS_SELECTOR, "iframe", iframe=False)
              for frame in frames:
                  try:
                      self.browser.switch_to.default_content()
                      self.browser.switch_to.frame(frame)
                      self.browser.iframe_env = True
                      break
                  except:
                      print("Iframe switch failed")
      elif self.browser.iframe_env == True:
          # Leave the iframe for code that targets the top document.
          self.browser.switch_to.default_content()
          self.browser.iframe_env = False

      mode = int(codeMode)
      if mode in (0, 2):
          if mode == 0:
              self.recordLog("Execute JavaScript:" + code)
              self.recordLog("执行JavaScript:" + code)
              script_args = ()
          else:
              self.recordLog("Execute JavaScript for element:" + code)
              self.recordLog("对元素执行JavaScript:" + code)
              script_args = (element,)
          self.browser.set_script_timeout(timeout)
          try:
              result = self.browser.execute_script(code, *script_args)
          except:
              result = ""
              self.recordLog("JavaScript execution failed")
      elif mode == 1:
          self.recordLog("Execute System Call:" + code)
          self.recordLog("执行系统命令:" + code)
          try:
              finished = subprocess.run(
                  code, capture_output=True, text=True, timeout=timeout, shell=True)
              result = finished.stdout
              print(result)
          except subprocess.TimeoutExpired:
              # The command ran longer than the allowed time.
              self.recordLog("Command timed out")
              self.recordLog("命令执行超时")
          except Exception as e:
              print(e)
              self.recordLog("Command execution failed")
              self.recordLog("命令执行失败")
      return str(result)
  356. def customOperation(self, node, loopValue, loopPath, index):
  357. paras = node["parameters"]
  358. codeMode = int(paras["codeMode"])
  359. code = paras["code"]
  360. output = ""
  361. max_wait_time = int(paras["waitTime"])
  362. if codeMode == 2: # 使用循环的情况下,传入的clickPath就是实际的xpath
  363. try:
  364. elements = self.browser.find_elements(
  365. By.XPATH, loopPath, iframe=paras["iframe"])
  366. element = elements[index]
  367. output = self.execute_code(
  368. codeMode, code, max_wait_time, element, iframe=paras["iframe"])
  369. except:
  370. output = ""
  371. print("JavaScript execution failed")
  372. elif codeMode == 3:
  373. self.BREAK = True
  374. else: # 0 1
  375. output = self.execute_code(
  376. codeMode, code, max_wait_time, iframe=paras["iframe"])
  377. recordASField = bool(paras["recordASField"])
  378. if recordASField:
  379. print("操作<" + node["title"] + ">的返回值为:" + output)
  380. print("The return value of operation <" + node["title"] + "> is: " + output)
  381. self.outputParameters[node["title"]] = output
  382. if recordASField:
  383. line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
  384. self.OUTPUT.append(line)
  def switchSelect(self, para, loopValue):
      """Change the selected option of the drop-down at ``para["xpath"]``.

      ``optionMode`` 0 advances to the next option (wrapping around), 1
      selects by index, 2 by value and 3 by visible text, each using
      ``optionValue``. Failures are reported on the console only.
      """
      mode = int(para["optionMode"])
      wanted = para["optionValue"]
      try:
          dropdown = Select(self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"]))
      except:
          print("找不到下拉框元素:", para["xpath"])
          print("Cannot find drop-down box element:", para["xpath"])
          return

      try:
          if mode == 0:
              # Step from the currently selected option to the one after it.
              current = dropdown.options.index(dropdown.first_selected_option)
              dropdown.select_by_index((current + 1) % len(dropdown.options))
          elif mode == 1:
              dropdown.select_by_index(int(wanted))
          elif mode == 2:
              dropdown.select_by_value(wanted)
          elif mode == 3:
              dropdown.select_by_visible_text(wanted)
      except:
          print("切换下拉框选项失败:", para["xpath"],
                para["optionMode"], para["optionValue"])
          print("Failed to change drop-down box option:",
                para["xpath"], para["optionMode"], para["optionValue"])
  def moveToElement(self, para, loopElement=None, loopPath="", index=0):
      """Hover the mouse over an element.

      With ``para["useLoop"]`` set, the ``index``-th match of ``loopPath``
      is used; otherwise the first match of ``para["xpath"]``. Lookup and
      hover failures are only printed.
      """
      time.sleep(0.1)  # short pause before moving
      if para["useLoop"]:
          path, position = loopPath, index
      else:
          path, position = para["xpath"], 0

      try:
          target = self.browser.find_elements(
              By.XPATH, path, iframe=para["iframe"])[position]
      except:
          print("找不到元素:", para["xpath"])
          print("Cannot find element:", para["xpath"])
          return

      try:
          ActionChains(self.browser).move_to_element(target).perform()
      except:
          print("移动鼠标到元素失败:", para["xpath"])
          print("Failed to move mouse to element:", para["xpath"])
  436. # 执行节点关键函数部分
  437. def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
  438. node = self.procedure[nodeId]
  439. WebDriverWait(self.browser, 10).until
  440. # 等待元素出现才进行操作,10秒内未出现则报错
  441. (EC.visibility_of_element_located(
  442. (By.XPATH, node["parameters"]["xpath"])))
  443. # 根据不同选项执行不同操作
  444. if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作
  445. for i in node["sequence"]: # 从根节点开始向下读取
  446. self.executeNode(i, loopValue, loopPath, index)
  447. elif node["option"] == 1: # 打开网页操作
  448. self.recordLog("openPage")
  449. self.openPage(node["parameters"], loopValue)
  450. elif node["option"] == 2: # 点击元素
  451. self.recordLog("Click")
  452. self.clickElement(node["parameters"], loopValue, loopPath, index)
  453. elif node["option"] == 3: # 提取数据
  454. self.recordLog("getData")
  455. self.getData(node["parameters"], loopValue, node["isInLoop"],
  456. parentPath=loopPath, index=index)
  457. self.saveData()
  458. elif node["option"] == 4: # 输入文字
  459. self.inputInfo(node["parameters"], loopValue)
  460. elif node["option"] == 5: # 自定义操作
  461. self.customOperation(node, loopValue, loopPath, index)
  462. self.saveData()
  463. elif node["option"] == 6: # 切换下拉框
  464. self.switchSelect(node["parameters"], loopValue)
  465. elif node["option"] == 7: # 鼠标移动到元素上
  466. self.moveToElement(node["parameters"], loopValue, loopPath, index)
  467. elif node["option"] == 8: # 循环
  468. self.recordLog("loop")
  469. self.loopExecute(node, loopValue, loopPath, index) # 执行循环
  470. elif node["option"] == 9: # 条件分支
  471. self.recordLog("judge")
  472. self.judgeExecute(node, loopValue, loopPath, index)
  473. # 执行完之后进行等待
  474. if node["option"] != 0 and node["option"] != 2: # 点击元素操作单独定义等待时间操作
  475. waitTime = 0.01 # 默认等待0.01秒
  476. if node["parameters"]["wait"] >= 0:
  477. waitTime = node["parameters"]["wait"]
  478. try:
  479. waitType = int(node["parameters"]["waitType"])
  480. except:
  481. waitType = 0
  482. if waitType == 0: # 固定等待时间
  483. time.sleep(waitTime)
  484. elif waitType == 1: # 随机等待时间
  485. time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
  486. self.Log("Wait seconds after node executing: ", waitTime)
  487. self.event.wait() # 等待事件结束
  488. # 对判断条件的处理
  489. def judgeExecute(self, node, loopElement, clickPath="", index=0):
  490. executeBranchId = 0 # 要执行的BranchId
  491. for i in node["sequence"]:
  492. cnode = self.procedure[i] # 获得条件分支
  493. tType = int(cnode["parameters"]["class"]) # 获得判断条件类型
  494. if tType == 0: # 什么条件都没有
  495. executeBranchId = i
  496. break
  497. elif tType == 1: # 当前页面包含文本
  498. try:
  499. bodyText = self.browser.find_element(
  500. By.CSS_SELECTOR, "body", iframe=cnode["parameters"]["iframe"]).text
  501. if bodyText.find(cnode["parameters"]["value"]) >= 0:
  502. executeBranchId = i
  503. break
  504. except: # 找不到元素下一个条件
  505. continue
  506. elif tType == 2: # 当前页面包含元素
  507. try:
  508. if self.browser.find_element(By.XPATH, cnode["parameters"]["value"], iframe=cnode["parameters"]["iframe"]):
  509. executeBranchId = i
  510. break
  511. except: # 找不到元素或者xpath写错了,下一个条件
  512. continue
  513. elif tType == 3: # 当前循环元素包括文本
  514. try:
  515. if loopElement.text.find(cnode["parameters"]["value"]) >= 0:
  516. executeBranchId = i
  517. break
  518. except: # 找不到元素或者xpath写错了,下一个条件
  519. continue
  520. elif tType == 4: # 当前循环元素包括元素
  521. try:
  522. if loopElement.find_element(By.XPATH, cnode["parameters"]["value"][1:]):
  523. executeBranchId = i
  524. break
  525. except: # 找不到元素或者xpath写错了,下一个条件
  526. continue
  527. elif tType <= 7: # JS命令返回值
  528. if tType == 5: # JS命令返回值等于
  529. output = self.execute_code(
  530. 0, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], iframe=cnode["parameters"]["iframe"])
  531. elif tType == 6: # System
  532. output = self.execute_code(
  533. 1, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], iframe=cnode["parameters"]["iframe"])
  534. elif tType == 7: # 针对当前循环项的JS命令返回值
  535. output = self.execute_code(
  536. 2, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement, iframe=cnode["parameters"]["iframe"])
  537. try:
  538. if output.find("rue") != -1: # 如果返回值中包含true
  539. code = 1
  540. else:
  541. code = int(output)
  542. except:
  543. code = 0
  544. if code > 0:
  545. executeBranchId = i
  546. break
  547. # rt.end()
  548. if executeBranchId != 0:
  549. self.executeNode(executeBranchId, loopElement, clickPath, index)
  550. # 对循环的处理
  551. def loopExecute(self, node, loopValue, clickPath="", index=0):
  552. time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
  553. # self.Log("循环执行前等待0.1秒")
  554. self.Log("Wait 0.1 second before loop")
  555. thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
  556. thisHistoryLength = self.browser.execute_script(
  557. 'return history.length') # 记录本次循环内的history的length
  558. self.history["index"] = thisHistoryLength
  559. self.history["handle"] = thisHandle
  560. if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
  561. # 无跳转标签页操作
  562. count = 0 # 执行次数
  563. while True: # do while循环
  564. try:
  565. finished = False
  566. element = self.browser.find_element(
  567. By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  568. for i in node["sequence"]: # 挨个执行操作
  569. self.executeNode(
  570. i, element, node["parameters"]["xpath"], 0)
  571. if self.BREAK: # 如果有break操作,下面的操作不执行
  572. break
  573. if self.BREAK: # 如果有break操作,退出循环
  574. self.BREAK = False
  575. finished = True
  576. break
  577. finished = True
  578. self.Log("Click: ", node["parameters"]["xpath"])
  579. self.recordLog("Click:" + node["parameters"]["xpath"])
  580. except NoSuchElementException:
  581. # except:
  582. print("Single loop element not found: ",
  583. node["parameters"]["xpath"])
  584. print("找不到要循环的单个元素: ", node["parameters"]["xpath"])
  585. self.recordLog(
  586. "Single loop element not found: " + node["parameters"]["xpath"])
  587. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  588. if node["option"] != 2:
  589. self.executeNode(
  590. i, None, node["parameters"]["xpath"], 0)
  591. finished = True
  592. break # 如果找不到元素,退出循环
  593. finally:
  594. if not finished:
  595. print("\n\n-------Retrying-------\n\n")
  596. self.Log("-------Retrying-------: ",
  597. node["parameters"]["xpath"])
  598. self.recordLog("ClickNotFound:" +
  599. node["parameters"]["xpath"])
  600. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  601. if node["option"] != 2:
  602. self.executeNode(
  603. i, None, node["parameters"]["xpath"], 0)
  604. break # 如果找不到元素,退出循环
  605. count = count + 1
  606. self.Log("Page: ", count)
  607. self.recordLog("Page:" + str(count))
  608. # print(node["parameters"]["exitCount"], "-------")
  609. if node["parameters"]["exitCount"] == count: # 如果达到设置的退出循环条件的话
  610. break
  611. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  612. output = self.execute_code(int(
  613. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  614. code = get_output_code(output)
  615. if code <= 0:
  616. break
  617. elif int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
  618. try:
  619. elements = self.browser.find_elements(By.XPATH,
  620. node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  621. if len(elements) == 0:
  622. print("Loop element not found: ",
  623. node["parameters"]["xpath"])
  624. print("找不到循环元素: ", node["parameters"]["xpath"])
  625. self.recordLog("pathNotFound: " +
  626. node["parameters"]["xpath"])
  627. for index in range(len(elements)):
  628. for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
  629. self.executeNode(i, elements[index],
  630. node["parameters"]["xpath"], index)
  631. if self.BREAK:
  632. break
  633. if self.BREAK:
  634. self.BREAK = False
  635. break
  636. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  637. while True: # 一直关闭窗口直到当前标签页
  638. self.browser.close() # 关闭使用完的标签页
  639. self.browser.switch_to.window(
  640. self.browser.window_handles[-1])
  641. if self.browser.current_window_handle == thisHandle:
  642. break
  643. if self.history["index"] != thisHistoryLength and self.history[
  644. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  645. difference = thisHistoryLength - \
  646. self.history["index"] # 计算历史记录变化差值
  647. self.browser.execute_script(
  648. 'history.go(' + str(difference) + ')') # 回退历史记录
  649. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  650. time.sleep(node["parameters"]["historyWait"])
  651. # else:
  652. # time.sleep(2)
  653. # 切换历史记录等待:
  654. self.Log("Change history back time or:",
  655. node["parameters"]["historyWait"])
  656. self.browser.execute_script('window.stop()')
  657. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  658. output = self.execute_code(int(
  659. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  660. code = get_output_code(output)
  661. if code <= 0:
  662. break
  663. except NoSuchElementException:
  664. print("Loop element not found: ", node["parameters"]["xpath"])
  665. print("找不到循环元素: ", node["parameters"]["xpath"])
  666. self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
  667. except Exception as e:
  668. raise
  669. elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
  670. # 千万不要忘了分割!!
  671. for path in node["parameters"]["pathList"].split("\n"):
  672. try:
  673. element = self.browser.find_element(
  674. By.XPATH, path, iframe=node["parameters"]["iframe"])
  675. for i in node["sequence"]: # 挨个执行操作
  676. self.executeNode(i, element, path, 0)
  677. if self.BREAK:
  678. break
  679. if self.BREAK:
  680. self.BREAK = False
  681. break
  682. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  683. while True: # 一直关闭窗口直到当前标签页
  684. self.browser.close() # 关闭使用完的标签页
  685. self.browser.switch_to.window(
  686. self.browser.window_handles[-1])
  687. if self.browser.current_window_handle == thisHandle:
  688. break
  689. if self.history["index"] != thisHistoryLength and self.history[
  690. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  691. difference = thisHistoryLength - \
  692. self.history["index"] # 计算历史记录变化差值
  693. self.browser.execute_script(
  694. 'history.go(' + str(difference) + ')') # 回退历史记录
  695. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  696. time.sleep(node["parameters"]["historyWait"])
  697. # else:
  698. # time.sleep(2)
  699. self.Log("Change history back time or:",
  700. node["parameters"]["historyWait"])
  701. self.browser.execute_script('window.stop()')
  702. except NoSuchElementException:
  703. print("Loop element not found: ", path)
  704. print("找不到循环元素: ", path)
  705. self.recordLog("pathNotFound: " + path)
  706. continue # 循环中找不到元素就略过操作
  707. except Exception as e:
  708. raise
  709. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  710. output = self.execute_code(int(
  711. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  712. code = get_output_code(output)
  713. if code <= 0:
  714. break
  715. elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
  716. textList = node["parameters"]["textList"].split("\n")
  717. for text in textList:
  718. self.recordLog("input: " + text)
  719. for i in node["sequence"]: # 挨个执行操作
  720. self.executeNode(i, text, "", 0)
  721. if self.BREAK:
  722. break
  723. if self.BREAK:
  724. self.BREAK = False
  725. break
  726. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  727. output = self.execute_code(int(
  728. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  729. code = get_output_code(output)
  730. if code <= 0:
  731. break
  732. elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
  733. # tempList = node["parameters"]["textList"].split("\r\n")
  734. urlList = list(
  735. filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
  736. # urlList = []
  737. # for url in tempList:
  738. # if url != "":
  739. # urlList.append(url)
  740. for url in urlList:
  741. self.recordLog("input: " + url)
  742. for i in node["sequence"]:
  743. self.executeNode(i, url, "", 0)
  744. if self.BREAK:
  745. break
  746. if self.BREAK:
  747. self.BREAK = False
  748. break
  749. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  750. output = self.execute_code(int(
  751. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  752. code = get_output_code(output)
  753. if code <= 0:
  754. break
  755. elif int(node["parameters"]["loopType"]) <= 6: # 命令返回值
  756. while True: # do while循环
  757. if int(node["parameters"]["loopType"]) == 5: # JS
  758. output = self.execute_code(
  759. 0, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  760. elif int(node["parameters"]["loopType"]) == 6: # System
  761. output = self.execute_code(
  762. 1, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  763. code = get_output_code(output)
  764. if code <= 0:
  765. break
  766. for i in node["sequence"]: # 挨个执行操作
  767. self.executeNode(i, code, node["parameters"]["xpath"], 0)
  768. if self.BREAK:
  769. break
  770. if self.BREAK:
  771. self.BREAK = False
  772. break
  773. self.history["index"] = thisHistoryLength
  774. self.history["handle"] = self.browser.current_window_handle
  775. self.scrollDown(node["parameters"])
  776. # 打开网页事件
  def openPage(self, para, loopValue):
      """Open a URL in the first browser tab and refresh the history bookkeeping.

      The URL comes from ``loopValue`` when ``para["useLoop"]`` is truthy, from
      ``self.links[self.urlId]`` when ``para["url"]`` is not ``"about:blank"``
      (all output parameters are cleared in that case), and otherwise from the
      first entry of ``para["links"]`` kept by the ``isnull`` filter.
      ``Field["name"]`` placeholders in the URL are replaced by the matching
      values in ``self.outputParameters``.

      Args:
          para: Parameters of the node. Keys read here: ``useLoop``, ``url``,
              ``links``, ``maxWaitTime`` and ``cookies``; the whole mapping is
              also passed to ``self.scrollDown``.
          loopValue: URL supplied by the enclosing loop.
      """
      time.sleep(1)  # Always wait at least one second before opening a page.
      if len(self.browser.window_handles) > 1:
          # Close the last tab before going back to the first one.
          self.browser.switch_to.window(self.browser.window_handles[-1])
          self.browser.close()
      # Page opening always happens in the first tab.
      self.browser.switch_to.window(self.browser.window_handles[0])
      self.history["handle"] = self.browser.current_window_handle

      if para["useLoop"]:
          url = loopValue
      elif para["url"] != "about:blank":
          url = self.links[self.urlId]
          # Clear output parameters.
          for key in self.outputParameters:
              self.outputParameters[key] = ""
      else:
          url = list(filter(isnull, para["links"].split("\n")))[0]

      # Replace Field["name"] placeholders with values from outputParameters.
      pattern = r'Field\["([^"]+)"\]'
      try:
          url = re.sub(
              pattern,
              lambda match: self.outputParameters.get(match.group(1), ''),
              url)
      except Exception:
          # Best effort: keep the URL unchanged when substitution fails
          # (for example when the URL is not a string).
          pass

      try:
          maxWaitTime = int(para["maxWaitTime"])
      except (KeyError, TypeError, ValueError):
          maxWaitTime = 10  # Default maximum wait time is 10 seconds.

      try:
          self.browser.set_page_load_timeout(maxWaitTime)  # Page-load timeout.
          self.browser.set_script_timeout(maxWaitTime)
          self.browser.get(url)
          if para["cookies"] != "":
              self.browser.delete_all_cookies()  # Drop all existing cookies.
              for cookie in para["cookies"].split('\n'):
                  cookie = cookie.strip()
                  if not cookie:
                      continue  # Blank line (e.g. trailing newline).
                  name, separator, value = cookie.partition('=')
                  if not separator:
                      # A line without '=' used to abort the whole cookie
                      # import and be reported as a page-load failure.
                      self.recordLog('Skip malformed cookie line: ' + cookie)
                      continue
                  self.browser.add_cookie({'name': name, 'value': value})
          self.Log('Loading page: ' + url)
          self.recordLog('Loading page: ' + url)
      except TimeoutException:
          self.Log('Time out after set seconds when loading page: ' + url)
          self.recordLog(
              'Time out after set seconds when loading page: ' + url)
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass  # Stopping the load is best effort only.
      except Exception as e:
          print("Failed to load page: " + url)
          self.recordLog('Failed to load page: ' + url)
          self.recordLog(str(e))  # Keep the real cause for diagnosis.

      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          try:
              self.browser.execute_script('window.stop()')
              self.history["index"] = self.browser.execute_script(
                  "return history.length")
          except Exception:
              self.history["index"] = 0
      self.scrollDown(para)  # Scroll the page down as configured.
  843. # 键盘输入事件
  def inputInfo(self, para, loopValue):
      """Clear an input box and type a value into it.

      The text is ``loopValue`` when ``para["useLoop"]`` is truthy, otherwise
      ``para["value"]``. ``Field["name"]`` placeholders are replaced by values
      from ``self.outputParameters``. Any ``<enter>`` marker (case-insensitive)
      is removed from the typed text and, when present in the original value,
      the ENTER key is sent afterwards. ``para["beforeJS"]`` and
      ``para["afterJS"]`` are run on the input box through
      ``self.execute_code`` before and after typing.

      Args:
          para: Parameters of the node. Keys read here: ``xpath``, ``iframe``,
              ``useLoop``, ``value``, ``beforeJS``, ``beforeJSWaitTime``,
              ``afterJS`` and ``afterJSWaitTime``.
          loopValue: Text supplied by the enclosing loop.
      """
      time.sleep(0.1)  # Wait 0.1 second before typing.
      self.Log("Wait 0.1 second before input")
      try:
          textbox = self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"])
          self.execute_code(
              2, para["beforeJS"], para["beforeJSWaitTime"], textbox,
              iframe=para["iframe"])  # Run the "before" script.
          # Clear the current content: HOME, then SHIFT+END to select, then DELETE.
          textbox.send_keys(Keys.HOME)
          textbox.send_keys(Keys.SHIFT, Keys.END)
          textbox.send_keys(Keys.DELETE)

          value = loopValue if para["useLoop"] else para["value"]

          # Replace Field["name"] placeholders with values from outputParameters.
          pattern = r'Field\["([^"]+)"\]'
          try:
              replaced_text = re.sub(
                  pattern,
                  lambda match: self.outputParameters.get(match.group(1), ''),
                  value)
              replaced_text = re.sub(
                  '<enter>', '', replaced_text, flags=re.IGNORECASE)
          except Exception:
              replaced_text = value  # Fall back to the raw value.
          textbox.send_keys(replaced_text)
          if value.lower().find("<enter>") >= 0:
              textbox.send_keys(Keys.ENTER)
          self.execute_code(
              2, para["afterJS"], para["afterJSWaitTime"], textbox,
              iframe=para["iframe"])  # Run the "after" script.
      except Exception as e:
          # `except Exception` (not a bare except) so KeyboardInterrupt and
          # SystemExit are no longer swallowed here.
          print("Cannot find input box element:" +
                para["xpath"] + ", please try to set the wait time before executing this operation")
          print("找不到输入框元素:" + para["xpath"] + ",请尝试在执行此操作前设置等待时间")
          self.recordLog("Cannot find input box element:" +
                         para["xpath"] + ", please try to set the wait time before executing this operation")
          # The failure is not always a missing element; keep the real cause.
          self.recordLog("inputInfo error: " + str(e))
  885. # 点击元素事件
  def clickElement(self, para, loopElement=None, clickPath="", index=0):
      """Click an element, run the optional before/after scripts and update history.

      Args:
          para: Parameters of the node. Keys read here: ``maxWaitTime``,
              ``useLoop``, ``xpath``, ``iframe``, ``beforeJS``,
              ``beforeJSWaitTime``, ``clickWay``, ``afterJS``,
              ``afterJSWaitTime``, ``wait`` and ``waitType``; the whole mapping
              is also passed to ``self.scrollDown``.
          loopElement: Not used by this method.
          clickPath: XPath to click when ``para["useLoop"]`` is truthy.
          index: Position of the target among the elements matched by the
              XPath; forced to 0 when ``para["useLoop"]`` is falsy.
      """
      try:
          maxWaitTime = int(para["maxWaitTime"])
      except (KeyError, TypeError, ValueError):
          maxWaitTime = 10
      self.browser.set_page_load_timeout(maxWaitTime)  # Page-load timeout.
      self.browser.set_script_timeout(maxWaitTime)

      # Bound up front so the error handlers below can always reference them.
      path = clickPath
      element = None

      # Locate the element and run the "before" script on it.
      try:
          if para["useLoop"]:
              # Inside a loop the caller passes the actual XPath.
              path = clickPath
          else:
              index = 0
              path = para["xpath"]  # Otherwise use the XPath of the node itself.
          elements = self.browser.find_elements(
              By.XPATH, path, iframe=para["iframe"])
          element = elements[index]
          if para["beforeJS"] != "":
              self.execute_code(2, para["beforeJS"],
                                para["beforeJSWaitTime"], element, iframe=para["iframe"])
      except Exception:
          print("Cannot find element:" +
                path + ", please try to set the wait time before executing this operation")
          print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
          self.recordLog("Cannot find element:" +
                         path + ", please try to set the wait time before executing this operation")

      tempHandleNum = len(self.browser.window_handles)  # Tab count before the click.
      try:
          click_way = int(para["clickWay"])
      except (KeyError, TypeError, ValueError):
          click_way = 0

      try:
          if click_way == 0:  # Click through Selenium.
              if element is not None:
                  ActionChains(self.browser).click(element).perform()
              else:
                  # Explicit message instead of an accidental NameError.
                  self.recordLog("Skip click, element not found: " + path)
          elif click_way == 1:  # Click through JavaScript.
              script = 'var result = document.evaluate(`' + path + \
                  '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
              self.browser.execute_script(script, str(index))
      except TimeoutException:
          self.Log('Time out after set seconds when loading clicked page')
          self.recordLog(
              'Time out after set seconds when loading clicked page')
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass  # Stopping the load is best effort only.
      except Exception as e:
          self.Log(e)
          self.recordLog(str(e))

      # Run the "after" script on the element.
      # NOTE(review): this looks up the first match of `path`, whereas the
      # "before" script runs on elements[index] - confirm this is intended.
      try:
          if para["afterJS"] != "":
              element = self.browser.find_element(
                  By.XPATH, path, iframe=para["iframe"])
              self.execute_code(2, para["afterJS"],
                                para["afterJSWaitTime"], element, iframe=para["iframe"])
      except Exception:
          print("Cannot find element:" + path)
          self.recordLog("Cannot find element:" +
                         path + ", please try to set the wait time before executing this operation")
          print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")

      # Wait after the click: fixed (waitType 0) or random (waitType 1).
      waitTime = float(para["wait"]) + 0.01
      try:
          waitType = int(para["waitType"])
      except (KeyError, TypeError, ValueError):
          waitType = 0
      if waitType == 0:
          time.sleep(waitTime)
      elif waitType == 1:
          time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))

      if tempHandleNum != len(self.browser.window_handles):
          # The click changed the number of tabs: switch to the last one.
          self.browser.switch_to.window(self.browser.window_handles[-1])
          self.history["handle"] = self.browser.current_window_handle
      # Record the history length of the current tab in both cases.
      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          self.browser.execute_script('window.stop()')
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      self.scrollDown(para)  # Scroll down according to the node parameters.
  def get_content(self, p, element):
      """Return the content of ``element`` selected by ``p["contentType"]``.

      Content types handled here:
          0   ``element.text`` (or href / value / src for node types 2 / 3 / 4)
          1   text of the element's own text nodes only, whitespace collapsed
              (or href / value / src for node types 2 / 3 / 4)
          2   ``innerHTML`` attribute
          3   ``outerHTML`` attribute
          4   CSS ``background-image`` value with the ``url("...")`` wrapper removed
          5   current URL of the browser
          6   title of the browser page
          7   saves a screenshot of the element as a PNG file; returns ``""``
          8   OCR of the element screenshot, ``"OCR Error"`` on failure
          9   result of running ``p["JS"]`` via ``self.execute_code`` (mode 2)
          10  ``value`` attribute of the selected ``<select>`` option
          11  text of the selected ``<select>`` option
          12  result of running ``p["JS"]`` via ``self.execute_code`` (mode 1)

      For node type 4 the image is also passed to ``download_image`` when
      ``p["downloadPic"]`` equals 1. Any other content type returns ``""``.

      Args:
          p: Field description; keys are read depending on the content type.
          element: Element the content is taken from.

      Returns:
          The extracted content.
      """
      def attribute_or_empty(name):
          # A missing attribute yields "" instead of None.
          value = element.get_attribute(name)
          return value if value is not None else ""

      def download_if_requested(url):
          try:
              download_pic = p["downloadPic"]
          except Exception:
              download_pic = 0
          if download_pic == 1:
              download_image(url, "Data/Task_" +
                             str(self.id) + "/" + self.saveName + "/")

      content = ""
      content_type = p["contentType"]
      if content_type == 0 or content_type == 1:
          # Special node types first: 2 -> href, 3 -> value, 4 -> image src.
          node_type = p["nodeType"]
          if node_type == 2:
              content = attribute_or_empty("href")
          elif node_type == 3:
              content = attribute_or_empty("value")
          elif node_type == 4:
              content = attribute_or_empty("src")
              download_if_requested(content)
          elif content_type == 0:
              content = element.text  # Ordinary node: full text.
          else:
              # Only the text directly under the element, without children.
              command = (
                  'var arr = [];'
                  'var content = arguments[0];'
                  'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
                  'if(content.childNodes[i].nodeType === 3){ '
                  'arr.push(content.childNodes[i].nodeValue);'
                  '}'
                  '}'
                  'var str = arr.join(" "); '
                  'return str;'
              )
              direct_text = self.browser.execute_script(command, element)
              # Fix: the former str.replace("\\s+", " ") looked for the literal
              # characters "\s+" and never collapsed whitespace; use a regex.
              content = re.sub(r"\s+", " ", direct_text.replace("\n", ""))
      elif content_type == 2:
          content = element.get_attribute('innerHTML')
      elif content_type == 3:
          content = element.get_attribute('outerHTML')
      elif content_type == 4:
          # Background image address with the url("...") wrapper removed.
          bg_url = element.value_of_css_property('background-image')
          content = bg_url.replace('url("', '').replace('")', '')
      elif content_type == 5:
          content = self.browser.current_url
      elif content_type == 6:
          content = self.browser.title
      elif content_type == 7:
          # Resize the window to the full page size, then save a screenshot.
          height = self.browser.execute_script(
              "return document.body.scrollHeight")
          width = self.browser.execute_script(
              "return document.body.scrollWidth")
          self.browser.set_window_size(width, height)
          element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
                             "/" + str(time.time()) + ".png")
      elif content_type == 8:
          try:
              screenshot_stream = io.BytesIO(element.screenshot_as_png)
              # Open the screenshot with Pillow and convert it to grayscale.
              image = Image.open(screenshot_stream).convert('L')
              # Recognise the text with the Tesseract OCR engine.
              content = pytesseract.image_to_string(image, lang='chi_sim+eng')
          except Exception as e:
              content = "OCR Error"
              print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
              if sys.platform == "win32":
                  print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
              elif sys.platform == "darwin":
                  print(e)
                  print(
                      "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
              elif sys.platform == "linux":
                  print(e)
                  print(
                      "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
              else:
                  print(e)
                  print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
      elif content_type == 9:
          content = self.execute_code(
              2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
      elif content_type == 12:  # Return value of a system command.
          content = self.execute_code(1, p["JS"], p["JSWaitTime"])
      elif content_type == 10:  # Value of the selected drop-down option.
          try:
              content = Select(element).first_selected_option.get_attribute(
                  "value")
          except Exception:
              content = ""
      elif content_type == 11:  # Text of the selected drop-down option.
          try:
              content = Select(element).first_selected_option.text
          except Exception:
              content = ""
      return content
  1115. # 提取数据事件
  def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
      """Extract every field in ``para["paras"]`` and append one row to ``self.OUTPUT``.

      Fields flagged ``optimizable`` are read from a parsed copy of the page
      source (lxml) without talking to the browser; a field whose ``iframe``
      differs from ``self.browser.iframe_env`` is downgraded to the slow path.
      All other fields are located through the browser and read with
      ``self.get_content``, with ``beforeJS`` / ``afterJS`` run around them.
      Each value is stored in ``self.outputParameters`` under the field name;
      the row is then built with ``new_line``.

      Args:
          para: Parameters of the node; ``para["paras"]`` is the field list.
          loopElement: Element of the enclosing loop, or ``""`` outside a loop.
          isInLoop: Not used by this method.
          parentPath: XPath of the loop elements; used to re-locate a lost
              ``loopElement`` and to build paths for relative fields.
          index: Zero-based position of ``loopElement`` among the matches of
              ``parentPath``.
      """
      try:
          pageHTML = etree.HTML(self.browser.page_source)
      except Exception:
          pageHTML = etree.HTML("")

      if loopElement != "":  # The loop element is only needed inside a loop.
          try:
              loopElementOuterHTML = loopElement.get_attribute('outerHTML')
          except Exception:
              # When links are clicked in a loop without opening a new tab the
              # loop element is lost and has to be located again.
              try:
                  elements = self.browser.find_elements(
                      By.XPATH, parentPath, iframe=para["paras"][0]["iframe"])
                  loopElement = elements[index]
                  loopElementOuterHTML = loopElement.get_attribute(
                      'outerHTML')
              except Exception:
                  loopElementOuterHTML = ""
      else:
          loopElementOuterHTML = ""
      loopElementHTML = etree.HTML(loopElementOuterHTML)

      # Fast path: read optimizable fields from the parsed page source.
      for p in para["paras"]:
          if p["optimizable"]:
              try:
                  # Fast extraction is only valid in the current iframe environment.
                  if self.browser.iframe_env != p["iframe"]:
                      p["optimizable"] = False
                      continue
                  if p["nodeType"] == 2:
                      if p["relativeXPath"].find("/@href") >= 0:
                          xpath = p["relativeXPath"]
                      else:
                          xpath = p["relativeXPath"] + "/@href"
                  elif p["contentType"] == 1:
                      # Do not append text() when it is already there.
                      if p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
                          xpath = p["relativeXPath"]
                      else:
                          xpath = p["relativeXPath"] + "/text()"
                  elif p["contentType"] == 0:
                      if p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
                          xpath = p["relativeXPath"]
                      else:
                          xpath = p["relativeXPath"] + "//text()"
                  if p["relative"]:
                      # A path containing // (descendant search) is evaluated
                      # against the whole page, indexed by loop position.
                      if p["relativeXPath"].find("//") >= 0:
                          full_path = "(" + parentPath + \
                              xpath + ")" + \
                              "[" + str(index + 1) + "]"
                          content = pageHTML.xpath(full_path)
                      else:
                          content = loopElementHTML.xpath(
                              "/html/body/" + loopElementHTML[0][0].tag + xpath)
                  else:
                      if xpath.find("/body") < 0:
                          xpath = "/html/body" + xpath
                      content = pageHTML.xpath(xpath)
                  if len(content) > 0:
                      # Join all text pieces, trimming surrounding whitespace.
                      content = ' '.join(result.strip()
                                         for result in content if result.strip())
                  else:
                      content = p["default"]
                      if not self.dataNotFoundKeys[p["name"]]:
                          print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                              p["relativeXPath"], p["name"]))
                          print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
                              p["name"], p["relativeXPath"]))
                          self.dataNotFoundKeys[p["name"]] = True
                          self.recordLog(
                              'Element %s not found, use default' % p["relativeXPath"])
              except Exception as e:
                  # Fix: `content` used to be left unbound (crash on the first
                  # field) or stale from the previous field here, although the
                  # message says the default is used. Actually use the default.
                  try:
                      content = p["default"]
                  except Exception:
                      content = ""
                  if not self.dataNotFoundKeys[p["name"]]:
                      print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                          p["relativeXPath"], p["name"]))
                      print("提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % (
                          p["name"], p["relativeXPath"]))
                      self.dataNotFoundKeys[p["name"]] = True
                      self.recordLog(
                          'Element %s not found, use default' % p["relativeXPath"])
              self.outputParameters[p["name"]] = content

      # Slow path: fields that cannot be optimized are read through the browser.
      for p in para["paras"]:
          if not p["optimizable"]:
              content = ""
              if not (p["contentType"] == 5 or p["contentType"] == 6):
                  # Anything except page URL / title needs an element.
                  try:
                      if p["relative"]:
                          if p["relativeXPath"] == "":
                              # The relative path may be the loop element itself.
                              element = loopElement
                          else:
                              # A path containing // is searched on the whole
                              # page, indexed by loop position.
                              if p["relativeXPath"].find("//") >= 0:
                                  full_path = "(" + parentPath + \
                                      p["relativeXPath"] + ")" + \
                                      "[" + str(index + 1) + "]"
                                  element = self.browser.find_element(
                                      By.XPATH, full_path, iframe=p["iframe"])
                              else:
                                  element = loopElement.find_element(By.XPATH,
                                                                     p["relativeXPath"][1:])
                      else:
                          element = self.browser.find_element(
                              By.XPATH, p["relativeXPath"], iframe=p["iframe"])
                  except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException):
                      # Element not found: use the default value.
                      try:
                          content = p["default"]
                      except Exception as e:
                          content = ""
                      self.outputParameters[p["name"]] = content
                      try:
                          if not self.dataNotFoundKeys[p["name"]]:
                              print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                                  p["relativeXPath"], p["name"]))
                              print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
                                  p["name"], p["relativeXPath"]))
                              self.dataNotFoundKeys[p["name"]] = True
                              self.recordLog(
                                  'Element %s not found, use default' % p["relativeXPath"])
                      except Exception:
                          pass  # Reporting is best effort only.
                      continue
                  except TimeoutException:
                      # Timed out: stop loading and look the element up again.
                      self.Log('Time out after set seconds when getting data')
                      self.recordLog(
                          'Time out after set seconds when getting data')
                      self.browser.execute_script('window.stop()')
                      if p["relative"]:
                          if p["relativeXPath"] == "":
                              element = loopElement
                          else:
                              element = loopElement.find_element(By.XPATH,
                                                                 p["relativeXPath"][1:])
                      else:
                          element = self.browser.find_element(
                              By.XPATH, p["relativeXPath"], iframe=p["iframe"])
              else:
                  element = self.browser.find_element(
                      By.XPATH, "//body", iframe=p["iframe"])
              try:
                  self.execute_code(
                      2, p["beforeJS"], p["beforeJSWaitTime"], element, iframe=p["iframe"])  # Run the "before" script.
                  content = self.get_content(p, element)
              except StaleElementReferenceException:
                  # The element went stale: wait, locate it again and retry once.
                  self.recordLog(
                      'StaleElementReferenceException: '+p["relativeXPath"])
                  time.sleep(3)
                  try:
                      if p["relative"]:
                          if p["relativeXPath"] == "":
                              element = loopElement
                              self.recordLog(
                                  'StaleElementReferenceException: loopElement')
                          else:
                              element = loopElement.find_element(By.XPATH,
                                                                 p["relativeXPath"][1:])
                              self.recordLog(
                                  'StaleElementReferenceException: loopElement+relativeXPath')
                      else:
                          element = self.browser.find_element(
                              By.XPATH, p["relativeXPath"], iframe=p["iframe"])
                          self.recordLog(
                              'StaleElementReferenceException: relativeXPath')
                      content = self.get_content(p, element)
                  except StaleElementReferenceException:
                      self.recordLog(
                          'StaleElementReferenceException: '+p["relativeXPath"])
                      continue  # Skip the field when it fails a second time.
              self.outputParameters[p["name"]] = content
              self.execute_code(
                  2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"])  # Run the "after" script.

      line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
      self.OUTPUT.append(line)
  1300. if __name__ == '__main__':
  1301. config = {
  1302. "id": [0],
  1303. "saved_file_name": "",
  1304. "user_data": False,
  1305. "config_folder": "",
  1306. "config_file_name": "config.json",
  1307. "read_type": "remote",
  1308. "headless": False,
  1309. "server_address": "http://localhost:8074",
  1310. "version": "0.3.5",
  1311. }
  1312. c = Config(config)
  1313. print(c)
  1314. options = Options()
  1315. driver_path = "chromedriver.exe"
  1316. import platform
  1317. print(sys.platform, platform.architecture())
  1318. option = webdriver.ChromeOptions()
  1319. if not os.path.exists(os.getcwd()+"/Data"):
  1320. os.mkdir(os.getcwd()+"/Data")
  1321. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  1322. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1323. # MacOS需要用option而不是options!
  1324. option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1325. option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1326. options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1327. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  1328. # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1329. # # MacOS需要用option而不是options!
  1330. # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1331. # driver_path = os.getcwd()+ "/chromedriver_mac64"
  1332. print(driver_path)
  1333. elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
  1334. print("Finding chromedriver in EasySpider",
  1335. os.getcwd()+"/EasySpider")
  1336. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  1337. options.binary_location = os.path.join(
  1338. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1339. driver_path = os.path.join(
  1340. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  1341. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1342. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  1343. options.binary_location = os.path.join(
  1344. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1345. driver_path = os.path.join(
  1346. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  1347. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1348. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  1349. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1350. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  1351. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1352. else:
  1353. print("Unsupported platform")
  1354. sys.exit()
  1355. print("Chrome location:", options.binary_location)
  1356. print("Chromedriver location:", driver_path)
  1357. # elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
  1358. # print("Finding chromedriver in ./Chrome",
  1359. # os.getcwd()+"/Chrome")
  1360. # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
  1361. # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
  1362. # driver_path = "./Chrome/chromedriver.exe"
  1363. elif os.path.exists(os.getcwd()+"/../ElectronJS"):
  1364. # 软件dev用
  1365. print("Finding chromedriver in EasySpider",
  1366. os.getcwd()+"/ElectronJS")
  1367. option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1368. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  1369. option.add_extension("../ElectronJS/XPathHelper.crx")
  1370. else:
  1371. options.binary_location = "./chrome.exe" # 指定chrome位置
  1372. driver_path = "./chromedriver.exe"
  1373. option.add_extension("XPathHelper.crx")
  1374. option.add_experimental_option(
  1375. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  1376. options.add_argument('-ignore-certificate-errors')
  1377. options.add_argument('-ignore -ssl-errors')
  1378. option.add_argument('-ignore-certificate-errors')
  1379. option.add_argument('-ignore -ssl-errors')
  1380. # user_data_dir = r'' # 注意没有Default!
  1381. # options.add_argument('--user-data-dir='+p)
  1382. # 总结:
  1383. # 0. 带Cookie需要用userdatadir
  1384. # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
  1385. # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
  1386. # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
  1387. # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
  1388. try:
  1389. with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
  1390. config = json.load(f)
  1391. absolute_user_data_folder = config["absolute_user_data_folder"]
  1392. print("\nAbsolute_user_data_folder:",
  1393. absolute_user_data_folder, "\n")
  1394. except:
  1395. pass
  1396. if c.user_data:
  1397. option.add_argument(
  1398. f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
  1399. option.add_argument("--profile-directory=Default")
  1400. if c.headless:
  1401. print("Headless mode")
  1402. print("无头模式")
  1403. option.add_argument("--headless")
  1404. options.add_argument("--headless")
  1405. # options.add_argument(
  1406. # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
  1407. option.add_argument(
  1408. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1409. options.add_argument(
  1410. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  # Start one BrowserThread per task id in c.id and remember it in `threads`.
  # NOTE(review): indentation was lost in the reviewed copy; the loop is
  # assumed to extend through the pause-hint banner — confirm.
  threads = []
  for i in c.id:
      print(options)
      print("id: ", i)

      # Load the task (service) description from the server or a local file.
      if c.read_type == "remote":
          print("remote")
          content = requests.get(
              c.server_address + "/queryExecutionInstance?id=" + str(i))
          service = json.loads(content.text)
      else:
          print("local")
          with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f:
              content = f.read()
          service = json.loads(content)
      print("Task Name:", service["name"])
      print("任务名称:", service["name"])

      # A missing "cloudflare" key means the regular (non-Cloudflare) mode.
      cloudflare = service.get("cloudflare", 0)
      if cloudflare == 0:
          # NOTE(review): `options`/`option` are shared by all tasks, so
          # settings added here presumably carry over to later tasks — confirm.
          # NOTE(review): Chrome generally expects an absolute download
          # directory; this path is relative to the working directory — confirm.
          for chrome_opts in (options, option):
              chrome_opts.add_experimental_option("prefs", {
                  # Set the file download path
                  "download.default_directory": "Data/Task_" + str(i),
                  # Suppress the download prompt dialog
                  "download.prompt_for_download": False,
                  "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
                  "download.directory_upgrade": True,
                  "download.extensions_to_open": "applications/pdf",
                  # Always open PDFs in an external program
                  "plugins.always_open_pdf_externally": True,
              })
          # A missing "environment" key leaves mobile emulation off.
          if service.get("environment") == 1:
              for chrome_opts in (option, options):
                  # Emulate browsing on an iPhone X
                  chrome_opts.add_experimental_option(
                      'mobileEmulation', {'deviceName': 'iPhone X'})
          browser_t = MyChrome(
              options=options, chrome_options=option, executable_path=driver_path)
      elif cloudflare == 1:
          browser_t = MyUCChrome(
              options=options, chrome_options=option, executable_path=driver_path)
          print("Pass Cloudflare Mode")
          print("过Cloudflare验证模式")
      else:
          # Without this branch `browser_t` would be unbound for the first
          # task, or would silently reuse the previous task's browser.
          print("Unknown cloudflare value:", cloudflare,
                "- skipping task with id:", i)
          print("未知的cloudflare取值:", cloudflare, ",已跳过任务:", i)
          continue

      # The event starts in the set state; it is handed to both the task
      # thread and the 'p' key listener started below.
      event = Event()
      event.set()
      thread = BrowserThread(browser_t, i, service,
                             c.version, event, c.saved_file_name, config=config)
      print("Thread with task id: ", i, " is created")
      threads.append(thread)
      thread.start()
      Thread(target=check_pause, args=("p", event)).start()
      time.sleep(5)
      print("\n\n----------------------------------")
      print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
      print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
      print("----------------------------------\n\n")
  # Wait for every task thread to finish before closing anything.
  for worker in threads:
      worker.join()

  # Once all threads have finished, quit the browser held by each one.
  for worker in threads:
      worker.browser.quit()

  # Final notice to the user (Chinese first, then English).
  for closing_message in (
          "程序已运行完成,请手动关闭此窗口。",
          "The program has finished running, please manually close this window."):
      print(closing_message)