easyspider_executestage.py 81 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610
  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. from datetime import datetime
  4. import io # 遇到错误退出时应执行的代码
  5. import json
  6. # from lib2to3.pgen2 import driver
  7. import re
  8. # import shutil
  9. import subprocess
  10. import sys
  11. # from urllib import parse
  12. # import base64
  13. # import hashlib
  14. import time
  15. import requests
  16. from urllib.parse import urljoin
  17. from lxml import etree
  18. # import undetected_chromedriver as uc
  19. from pynput.keyboard import Key, Listener
  20. from selenium.webdriver.chrome.options import Options
  21. from selenium.webdriver.common.keys import Keys
  22. from selenium.webdriver.common.action_chains import ActionChains
  23. from selenium import webdriver
  24. from selenium.webdriver.support.ui import WebDriverWait
  25. from selenium.webdriver.support import expected_conditions as EC
  26. from selenium.webdriver.common.by import By
  27. from selenium.common.exceptions import NoSuchElementException
  28. from selenium.common.exceptions import TimeoutException
  29. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  30. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  31. from selenium.webdriver.support.ui import Select
  32. from selenium.webdriver import ActionChains
  33. from selenium.webdriver.common.by import By
  34. import random
  35. # import pandas as pd
  36. from openpyxl import load_workbook, Workbook
  37. # import numpy
  38. import csv
  39. import os
  40. from commandline_config import Config
  41. import pytesseract
  42. from PIL import Image
  43. # import uuid
  44. from threading import Thread, Event
  45. from myChrome import MyChrome
  46. if sys.platform != "darwin":
  47. from myChrome import MyUCChrome
  48. from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
  49. desired_capabilities = DesiredCapabilities.CHROME
  50. desired_capabilities["pageLoadStrategy"] = "none"
  51. class BrowserThread(Thread):
  def __init__(self, browser_t, id, service, version, event, saveName, config):
      """Prepare one task run: output location, browser stealth script, task state.

      Args:
          browser_t: Browser driver the task runs in (stored as ``self.browser``).
          id: Task id; names the ``Data/Task_<id>`` output folder.
          service: Task description mapping. Keys read unconditionally: "graph",
              "links", "containJudge", "outputParameters" and "version".
              Optional keys: "saveName", "maxViewLength", "outputFormat",
              "saveThreshold".
          version: Version of the running program; tasks older than 0.3.1 must
              match it exactly.
          event: Stored as ``self.event``; ``executeNode`` calls its ``wait()``
              after every node.
          saveName: Save name from the command line; "" keeps the task's own.
          config: Run configuration; "mysql_config_path" is read for MySQL output.

      Raises:
          SystemExit: The task has no "version" field or its version is
              incompatible with ``version``.
      """
      Thread.__init__(self)
      self.browser = browser_t
      self.config = config
      self.id = id
      self.event = event
      self.log = ""
      self.SAVED = False  # whether the data has already been stored
      self.BREAK = False

      # Save name priority: command line > task definition > current timestamp.
      timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
      if saveName != "":
          self.saveName = saveName
      else:
          self.saveName = service.get("saveName", timestamp)
      self.saveName = self.saveName.replace("current_time", timestamp)
      print("Save Name for task ID", self.id, "is:", self.saveName)
      print("任务ID", self.id, "的保存文件名为:", self.saveName)

      # Folder that holds screenshots for this run; makedirs also creates any
      # missing parent (Data/, Data/Task_<id>).
      task_dir = "Data/Task_" + str(self.id)
      os.makedirs(task_dir + "/" + self.saveName, exist_ok=True)

      # NOTE(review): driver_path is not defined in this method; presumably a
      # module-level global set by the launcher - confirm.
      stealth_path = driver_path[:driver_path.find(
          "chromedriver")] + "stealth.min.js"
      with open(stealth_path, 'r', encoding="utf-8") as f:
          js = f.read()
      print("Loading stealth.min.js")
      # Inject the stealth script into every new document (anti-bot detection).
      self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
          'source': js})
      self.browser.get('about:blank')

      self.procedure = service["graph"]  # execution flow of the task
      self.maxViewLength = service.get("maxViewLength", 15)  # max display length
      self.outputFormat = service.get("outputFormat", "csv")

      # Version gate. KeyError is caught on its own so that the SystemExit
      # raised for a mismatch is not swallowed by the "no version" handler.
      try:
          task_version = service["version"]
      except KeyError:
          # Tasks from v0.2.0 carry no version field.
          print("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider v0.2.0 to run this task!")
          self.browser.quit()
          sys.exit()
      # Compare numerically: as plain strings "0.10.0" would sort below "0.3.1".
      version_key = tuple(int(part)
                          for part in re.findall(r"\d+", str(task_version)))
      if version_key < (0, 3, 1) and task_version != version:
          print("版本不一致,请使用" +
                task_version + "版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider " +
                task_version + " to run this task!")
          self.browser.quit()
          sys.exit()

      self.save_threshold = service.get("saveThreshold", 10)  # rows per flush
      self.links = list(
          filter(isnull, service["links"].split("\n")))  # links to execute

      self.OUTPUT = []  # collected rows
      self.writeMode = 1  # 0 = new file, 1 = append, 2 = MySQL
      if self.outputFormat in ("csv", "txt", "xlsx"):
          data_file = task_dir + "/" + self.saveName + '.' + self.outputFormat
          if not os.path.exists(data_file):
              self.OUTPUT.append([])  # header row, filled in below
              self.writeMode = 0
      elif self.outputFormat == "mysql":
          self.mysql = myMySQL(config["mysql_config_path"])
          self.mysql.create_table(self.saveName, service["outputParameters"])
          self.writeMode = 2
      if self.writeMode == 1:
          print("追加模式")
          print("Append Mode")
      elif self.writeMode == 0:
          print("新建模式")
          print("New Mode")
      elif self.writeMode == 2:
          print("MySQL模式")
          print("MySQL Mode")

      self.containJudge = service["containJudge"]  # task contains judge nodes
      self.outputParameters = {}
      self.outputParametersTypes = []
      self.outputParametersRecord = []  # whether each field is recorded
      self.dataNotFoundKeys = {}  # keys for which no data was found
      self.history = {"index": 0, "handle": None}  # position in page history
      for para in service["outputParameters"]:
          name = para["name"]
          if name in self.outputParameters:
              continue  # duplicate field names are registered once
          self.outputParameters[name] = ""
          self.dataNotFoundKeys[name] = False
          self.outputParametersTypes.append(para.get("type", "text"))
          self.outputParametersRecord.append(
              bool(para.get("recordASField", True)))
          # writeMode 0 only occurs for a new csv/txt/xlsx file; appending to
          # an existing file must not repeat the header.
          if self.writeMode == 0:
              self.OUTPUT[0].append(name)
      self.urlId = 0  # index of the link currently being executed
      self.preprocess()  # normalise nodes and mark optimisable extractions
  168. # 检测如果没有复杂的操作,优化提取数据流程
  def preprocess(self):
      """Normalise every node of ``self.procedure`` in place before execution.

      For each node: default ``parameters["iframe"]`` to False and pass
      ``parameters["xpath"]`` (when present) through
      ``lowercase_tags_in_xpath``. Open-page nodes (option 1) get a default
      empty "cookies" value. Extract-data nodes (option 3) get the same
      iframe/XPath treatment for each entry of ``parameters["paras"]`` plus an
      "optimizable" flag, which is True when the entry has no before/after
      JavaScript, ``contentType <= 1`` and ``nodeType <= 2``.
      """
      for node in self.procedure:
          params = node["parameters"]
          params.setdefault("iframe", False)
          if "xpath" in params:
              try:
                  params["xpath"] = lowercase_tags_in_xpath(params["xpath"])
              except Exception:
                  pass  # best effort: keep the XPath as written
          if node["option"] == 1:  # open page
              params.setdefault("cookies", "")
          if node["option"] == 3:  # extract data
              for para in params["paras"]:
                  para.setdefault("iframe", False)
                  if "relativeXPath" in para:
                      try:
                          para["relativeXPath"] = lowercase_tags_in_xpath(
                              para["relativeXPath"])
                      except Exception:
                          pass  # best effort: keep the XPath as written
                  para["optimizable"] = (
                      para["beforeJS"] == ""
                      and para["afterJS"] == ""
                      and para["contentType"] <= 1
                      and para["nodeType"] <= 2
                  )
  def run(self):
      """Thread entry point: execute the task graph once per link, then flush.

      For every entry of ``self.links`` the root node (id 0) is executed and
      ``self.urlId`` is advanced. Afterwards the per-run output folder is
      removed if it is empty, the remaining data and log are saved, and the
      MySQL connection is closed when MySQL output is in use.
      """
      total = len(self.links)
      for position, _ in enumerate(self.links, start=1):
          print("正在执行第", position, "/ ", total, "个链接")
          print("Executing link", position, "/ ", total)
          self.executeNode(0)
          self.urlId = self.urlId + 1
      out_dir = "Data/Task_" + str(self.id) + "/" + self.saveName
      # Remove the folder when nothing was written into it; the isdir guard
      # avoids an error if the folder is already gone.
      if os.path.isdir(out_dir) and not os.listdir(out_dir):
          os.rmdir(out_dir)
      print("Done!")
      print("执行完成!")
      self.recordLog("Done!")
      self.saveData(exit=True)
      if self.outputFormat == "mysql":
          self.mysql.close()
  def recordLog(self, str=""):
      """Append ``str`` plus a newline to the in-memory log ``self.log``.

      The parameter name shadows the builtin ``str``; it is kept so existing
      keyword callers continue to work.
      """
      self.log += str + "\n"
  219. # 控制台打印log函数
  def Log(self, text, text2=""):
      """Print a debug message to the console; currently switched off.

      Set ``switch`` to True below to have ``text`` and ``text2`` printed.
      """
      switch = False
      if switch:
          print(text, text2)
  224. # @atexit.register
  225. # def clean(self):
  226. # self.saveData(exit=True)
  227. # self.browser.quit()
  228. # sys.exit(0)
  def saveData(self, exit=False):
      """Flush the buffered log and collected rows to the configured output.

      Nothing happens until ``self.OUTPUT`` holds at least
      ``self.save_threshold`` rows, unless ``exit`` is true, which forces the
      flush. After a flush both ``self.OUTPUT`` and ``self.log`` are emptied.

      Args:
          exit: Force a flush regardless of how many rows are buffered.
      """
      if not exit and len(self.OUTPUT) < self.save_threshold:
          return
      base_name = "Data/Task_" + str(self.id) + "/" + self.saveName
      # Log: appended to <saveName>_log.txt; the with-block closes the file.
      with open(base_name + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
          file_obj.write(self.log)
      # Data: one writer per output format.
      if self.outputFormat == "csv" or self.outputFormat == "txt":
          file_name = base_name + '.' + self.outputFormat
          write_to_csv(file_name, self.OUTPUT, self.outputParametersRecord)
      elif self.outputFormat == "xlsx":
          file_name = base_name + '.xlsx'
          write_to_excel(file_name, self.OUTPUT,
                         self.outputParametersTypes, self.outputParametersRecord)
      elif self.outputFormat == "mysql":
          self.mysql.write_to_mysql(
              self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
      self.OUTPUT = []
      self.log = ""
  def scrollDown(self, para, rt=""):
      """Scroll the page by sending keys to ``<body>`` as configured in ``para``.

      ``para["scrollType"]`` selects the behaviour: 0 does nothing, 1 sends
      PAGE_DOWN and 2 sends END ``para["scrollCount"]`` times, 3 sends END
      until ``page_source`` stops changing. ``para["scrollWaitTime"]``, when
      usable, is slept before scrolling and after every key press.

      If scrolling raises, page loading is stopped with ``window.stop()`` and
      a fixed-count scroll is attempted once more; an error in that second
      attempt propagates to the caller.

      Args:
          para: Node parameters; reads "scrollType", "scrollCount", "iframe"
              and optionally "scrollWaitTime".
          rt: Optional object whose ``end()`` is called when scrolling is done;
              "" means none.
      """
      def pause():
          # A missing or unusable wait time simply means "do not wait".
          try:
              time.sleep(para["scrollWaitTime"])
          except (KeyError, TypeError, ValueError):
              pass

      def scroll_fixed(scroll_key):
          # Press scroll_key on <body> scrollCount times (None: only wait).
          for _ in range(para["scrollCount"]):
              self.Log("Wait for set second after screen scrolling")
              body = self.browser.find_element(
                  By.CSS_SELECTOR, "body", iframe=para["iframe"])
              if scroll_key is not None:
                  body.send_keys(scroll_key)
              pause()

      pause()  # wait before scrolling
      scrollType = int(para["scrollType"])
      fixed_keys = {1: Keys.PAGE_DOWN, 2: Keys.END}
      try:
          if scrollType != 0 and para["scrollCount"] > 0:
              if scrollType in fixed_keys:
                  scroll_fixed(fixed_keys[scrollType])
              elif scrollType == 3:
                  bodyText = ""
                  rounds = 0
                  while True:
                      newBodyText = self.browser.page_source
                      if newBodyText == bodyText:
                          print("页面已检测不到新内容,停止滚动。")
                          print("No new content detected on the page, stop scrolling.")
                          break
                      bodyText = newBodyText
                      body = self.browser.find_element(
                          By.CSS_SELECTOR, "body", iframe=para["iframe"])
                      body.send_keys(Keys.END)
                      print("滚动到底部,第", rounds + 1, "次。")
                      print("Scroll to the bottom, the", rounds + 1, "time.")
                      rounds = rounds + 1
                      pause()
      except Exception:
          self.Log('Time out after set seconds when scrolling. ')
          self.recordLog('Time out after set seconds when scrolling')
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass  # stopping the page load is best effort
          if scrollType != 0 and para["scrollCount"] > 0:
              # Retry with the same keys as the first attempt. Type 3 has no
              # fixed key and only waits, as the original retry did.
              scroll_fixed(fixed_keys.get(scrollType))
      if rt != "":
          rt.end()
  def execute_code(self, codeMode, code, max_wait_time, element=None, iframe=False):
      """Run a piece of user-supplied code and return its result as a string.

      Every ``Field["name"]`` occurrence in ``code`` is first replaced by the
      current value of ``self.outputParameters["name"]`` ("" when unknown).

      Args:
          codeMode: 0 runs JavaScript in the page, 2 runs JavaScript with
              ``element`` passed as its argument, 1 runs a system command.
          code: Code to run; "" returns "" immediately.
          max_wait_time: Script/command timeout; 0 is treated as 999999.
          element: Element handed to the script in mode 2.
          iframe: Whether the code should run inside the page's iframe.

      Returns:
          ``str()`` of the script's return value or of the command's stdout;
          "" when execution failed or timed out.
      """
      output = ""
      if code == "":
          return ""
      if max_wait_time == 0:
          max_wait_time = 999999
      # Substitute Field["..."] placeholders with already extracted values.
      pattern = r'Field\["([^"]+)"\]'
      try:
          code = re.sub(
              pattern, lambda match: self.outputParameters.get(match.group(1), ''), code)
      except Exception:
          pass  # e.g. a non-string field value: run the code unsubstituted

      # Make the browser context (main document / iframe) match the request.
      if iframe and self.browser.iframe_env == False:
          self.browser.switch_to.default_content()
          frames = self.browser.find_elements(
              By.CSS_SELECTOR, "iframe", iframe=False)
          # Enter the first iframe that can be switched to.
          for frame in frames:
              try:
                  self.browser.switch_to.default_content()
                  self.browser.switch_to.frame(frame)
                  self.browser.iframe_env = True
                  break
              except Exception:
                  print("Iframe switch failed")
      elif not iframe and self.browser.iframe_env == True:
          self.browser.switch_to.default_content()
          self.browser.iframe_env = False

      mode = int(codeMode)
      if mode == 0 or mode == 2:
          if mode == 0:
              self.recordLog("Execute JavaScript:" + code)
              self.recordLog("执行JavaScript:" + code)
          else:
              self.recordLog("Execute JavaScript for element:" + code)
              self.recordLog("对元素执行JavaScript:" + code)
          self.browser.set_script_timeout(max_wait_time)
          try:
              if mode == 0:
                  output = self.browser.execute_script(code)
              else:
                  output = self.browser.execute_script(code, element)
          except Exception:
              output = ""
              self.recordLog("JavaScript execution failed")
      elif mode == 1:
          self.recordLog("Execute System Call:" + code)
          self.recordLog("执行系统命令:" + code)
          try:
              # SECURITY: the command string comes from the task definition
              # and runs through the shell; only run trusted task files.
              completed = subprocess.run(
                  code, capture_output=True, text=True, timeout=max_wait_time, shell=True)
              output = completed.stdout
              print(output)
          except subprocess.TimeoutExpired:
              self.recordLog("Command timed out")
              self.recordLog("命令执行超时")
          except Exception as e:
              print(e)
              self.recordLog("Command execution failed")
              self.recordLog("命令执行失败")
      return str(output)
  def customOperation(self, node, loopValue, loopPath, index):
      """Execute a custom-operation node and store its return value.

      ``parameters["codeMode"]`` 0 and 1 run the code through
      ``execute_code``; 2 runs JavaScript against the ``index``-th element
      matching ``loopPath``; 3 only sets ``self.BREAK`` so the enclosing loop
      stops. The result is stored in ``self.outputParameters`` under the
      node's title and, when "recordASField" is truthy, a new output row is
      appended to ``self.OUTPUT``.

      Args:
          node: The node being executed (reads "parameters" and "title").
          loopValue: Unused here; part of the common node-handler signature.
          loopPath: XPath of the enclosing loop's elements (used in mode 2).
          index: Position of the current loop element within ``loopPath``.
      """
      paras = node["parameters"]
      codeMode = int(paras["codeMode"])
      code = paras["code"]
      output = ""
      max_wait_time = int(paras["waitTime"])
      if codeMode == 2:  # JavaScript on the current loop element
          try:
              elements = self.browser.find_elements(
                  By.XPATH, loopPath, iframe=paras["iframe"])
              element = elements[index]
              output = self.execute_code(
                  codeMode, code, max_wait_time, element, iframe=paras["iframe"])
          except Exception:
              output = ""
              print("JavaScript execution failed")
      elif codeMode == 3:  # break out of the enclosing loop
          self.BREAK = True
      else:  # 0: page JavaScript, 1: system command
          output = self.execute_code(
              codeMode, code, max_wait_time, iframe=paras["iframe"])
      recordASField = bool(paras["recordASField"])
      if recordASField:
          print("操作<" + node["title"] + ">的返回值为:" + output)
          print("The return value of operation <" + node["title"] + "> is: " + output)
      # The value is always stored; a row is emitted only when recorded.
      self.outputParameters[node["title"]] = output
      if recordASField:
          line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
          self.OUTPUT.append(line)
  def switchSelect(self, para, loopValue):
      """Change the selected option of the drop-down located by ``para["xpath"]``.

      ``para["optionMode"]`` chooses how: 0 selects the option after the
      currently selected one (wrapping around to the first), 1 selects by
      index, 2 by value and 3 by visible text, the latter three using
      ``para["optionValue"]``. Failures are reported on the console and not
      raised.

      Args:
          para: Node parameters; reads "xpath", "iframe", "optionMode" and
              "optionValue".
          loopValue: Unused here; part of the common node-handler signature.
      """
      optionMode = int(para["optionMode"])
      optionValue = para["optionValue"]
      try:
          dropdown = Select(self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"]))
          try:
              if optionMode == 0:
                  # Advance to the next option, wrapping at the end.
                  current_index = dropdown.options.index(
                      dropdown.first_selected_option)
                  next_index = (current_index + 1) % len(dropdown.options)
                  dropdown.select_by_index(next_index)
              elif optionMode == 1:
                  dropdown.select_by_index(int(optionValue))
              elif optionMode == 2:
                  dropdown.select_by_value(optionValue)
              elif optionMode == 3:
                  dropdown.select_by_visible_text(optionValue)
          except Exception:
              print("切换下拉框选项失败:", para["xpath"],
                    para["optionMode"], para["optionValue"])
              print("Failed to change drop-down box option:",
                    para["xpath"], para["optionMode"], para["optionValue"])
      except Exception:
          print("找不到下拉框元素:", para["xpath"])
          print("Cannot find drop-down box element:", para["xpath"])
  def moveToElement(self, para, loopElement=None, loopPath="", index=0):
      """Move the mouse pointer onto an element.

      With ``para["useLoop"]`` set, the target is the ``index``-th element
      matching ``loopPath``; otherwise it is the first element matching
      ``para["xpath"]``. Failures are reported on the console and not raised.

      Args:
          para: Node parameters; reads "useLoop", "xpath" and "iframe".
          loopElement: Unused; the element is looked up again by XPath.
          loopPath: XPath of the enclosing loop's elements.
          index: Position of the current loop element within ``loopPath``.
      """
      time.sleep(0.1)  # short pause before moving
      if para["useLoop"]:
          path = loopPath
      else:
          index = 0
          path = para["xpath"]
      try:
          elements = self.browser.find_elements(
              By.XPATH, path, iframe=para["iframe"])
          element = elements[index]
          try:
              ActionChains(self.browser).move_to_element(element).perform()
          except Exception:
              # Report the XPath that was actually used, which in loop mode
              # is the loop's path rather than para["xpath"].
              print("移动鼠标到元素失败:", path)
              print("Failed to move mouse to element:", path)
      except Exception:
          print("找不到元素:", path)
          print("Cannot find element:", path)
  464. # 执行节点关键函数部分
  def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
      """Execute node ``self.procedure[nodeId]`` and then apply its wait settings.

      The node's "option" selects the handler: 0/10 run the child sequence,
      1 open page, 2 click, 3 extract data, 4 input text, 5 custom operation,
      6 switch drop-down, 7 move mouse, 8 loop, 9 judge. Extract-data and
      custom-operation nodes are followed by ``saveData()``.

      Afterwards every node except options 0 and 2 sleeps for
      ``parameters["wait"]`` seconds (0.01 when negative), either exactly
      (waitType 0) or for a random time between 50% and 150% of it
      (waitType 1). Finally ``self.event.wait()`` is called.

      Args:
          nodeId: Index of the node in ``self.procedure``.
          loopValue: Current loop element, forwarded to the handlers.
          loopPath: XPath of the enclosing loop's elements.
          index: Position of the current loop element within ``loopPath``.
      """
      node = self.procedure[nodeId]
      option = node["option"]
      # The original built a WebDriverWait and a visibility condition here
      # but never called .until(), so nothing was waited for; that dead code
      # has been removed.
      if option == 0 or option == 10:  # root / branch of a judge
          for child in node["sequence"]:
              self.executeNode(child, loopValue, loopPath, index)
      elif option == 1:  # open page
          self.recordLog("openPage")
          self.openPage(node["parameters"], loopValue)
      elif option == 2:  # click element
          self.recordLog("Click")
          self.clickElement(node["parameters"], loopValue, loopPath, index)
      elif option == 3:  # extract data
          self.recordLog("getData")
          self.getData(node["parameters"], loopValue, node["isInLoop"],
                       parentPath=loopPath, index=index)
          self.saveData()
      elif option == 4:  # input text
          self.inputInfo(node["parameters"], loopValue)
      elif option == 5:  # custom operation
          self.customOperation(node, loopValue, loopPath, index)
          self.saveData()
      elif option == 6:  # switch drop-down option
          self.switchSelect(node["parameters"], loopValue)
      elif option == 7:  # move mouse onto element
          self.moveToElement(node["parameters"], loopValue, loopPath, index)
      elif option == 8:  # loop
          self.recordLog("loop")
          self.loopExecute(node, loopValue, loopPath, index)
      elif option == 9:  # judge (conditional branch)
          self.recordLog("judge")
          self.judgeExecute(node, loopValue, loopPath, index)

      # Post-node wait; click nodes handle their own waiting.
      if option != 0 and option != 2:
          waitTime = 0.01  # default when the configured wait is negative
          if node["parameters"]["wait"] >= 0:
              waitTime = node["parameters"]["wait"]
          try:
              waitType = int(node["parameters"]["waitType"])
          except (KeyError, TypeError, ValueError):
              waitType = 0
          if waitType == 0:  # fixed wait
              time.sleep(waitTime)
          elif waitType == 1:  # randomised wait
              time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
          self.Log("Wait seconds after node executing: ", waitTime)
      self.event.wait()  # wait on the event before continuing
  516. # 对判断条件的处理
  def judgeExecute(self, node, loopElement, clickPath="", index=0):
      """Execute the first branch of a judge node whose condition holds.

      The branches listed in ``node["sequence"]`` are tested in order; the
      branch's ``parameters["class"]`` selects the condition:

      * 0 - unconditional
      * 1 - page body text contains ``value``
      * 2 - page contains an element matching XPath ``value``
      * 3 - current loop element's text contains ``value``
      * 4 - current loop element contains an element matching ``value[1:]``
      * 5 / 6 / 7 - return value of page JavaScript / a system command /
        JavaScript on the loop element: the branch matches when the output
        contains "rue" (as in "true"/"True") or parses as an integer > 0

      A condition whose lookup raises counts as not met. If no branch
      matches, nothing is executed.

      Args:
          node: The judge node (reads "sequence").
          loopElement: Current loop element, or None outside a loop.
          clickPath: XPath of the enclosing loop's elements, forwarded.
          index: Position of the current loop element, forwarded.
      """
      executeBranchId = 0  # 0 = no branch selected
      for i in node["sequence"]:
          params = self.procedure[i]["parameters"]
          tType = int(params["class"])
          matched = False
          if tType == 0:
              matched = True
          elif tType == 1:
              try:
                  bodyText = self.browser.find_element(
                      By.CSS_SELECTOR, "body", iframe=params["iframe"]).text
                  matched = bodyText.find(params["value"]) >= 0
              except Exception:
                  continue  # body not found: try the next branch
          elif tType == 2:
              try:
                  matched = bool(self.browser.find_element(
                      By.XPATH, params["value"], iframe=params["iframe"]))
              except Exception:
                  continue  # element missing or invalid XPath
          elif tType == 3:
              try:
                  matched = loopElement.text.find(params["value"]) >= 0
              except Exception:
                  continue  # no usable loop element
          elif tType == 4:
              try:
                  # value[1:] drops the first character - presumably to make
                  # the XPath relative to the loop element; TODO confirm.
                  matched = bool(loopElement.find_element(
                      By.XPATH, params["value"][1:]))
              except Exception:
                  continue  # element missing or invalid XPath
          elif tType <= 7:
              output = ""
              if tType == 5:  # page JavaScript
                  output = self.execute_code(
                      0, params["code"], params["waitTime"], iframe=params["iframe"])
              elif tType == 6:  # system command
                  output = self.execute_code(
                      1, params["code"], params["waitTime"], iframe=params["iframe"])
              elif tType == 7:  # JavaScript on the current loop element
                  output = self.execute_code(
                      2, params["code"], params["waitTime"], loopElement, iframe=params["iframe"])
              try:
                  # "rue" matches both "true" and "True".
                  code = 1 if output.find("rue") != -1 else int(output)
              except Exception:
                  code = 0
              matched = code > 0
          if matched:
              executeBranchId = i
              break
      if executeBranchId != 0:
          self.executeNode(executeBranchId, loopElement, clickPath, index)
  578. # 对循环的处理
  579. def loopExecute(self, node, loopValue, clickPath="", index=0):
  580. time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
  581. # self.Log("循环执行前等待0.1秒")
  582. self.Log("Wait 0.1 second before loop")
  583. thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
  584. thisHistoryLength = self.browser.execute_script(
  585. 'return history.length') # 记录本次循环内的history的length
  586. self.history["index"] = thisHistoryLength
  587. self.history["handle"] = thisHandle
  588. if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
  589. # 无跳转标签页操作
  590. count = 0 # 执行次数
  591. bodyText = "-"
  592. while True: # do while循环
  593. try:
  594. finished = False
  595. newBodyText = self.browser.page_source
  596. if newBodyText == bodyText: # 如果页面内容无变化
  597. print("页面已检测不到新内容,停止循环。")
  598. print("No new content detected on the page, stop loop.")
  599. finished = True
  600. break
  601. else:
  602. bodyText = newBodyText
  603. element = self.browser.find_element(
  604. By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  605. for i in node["sequence"]: # 挨个执行操作
  606. self.executeNode(
  607. i, element, node["parameters"]["xpath"], 0)
  608. if self.BREAK: # 如果有break操作,下面的操作不执行
  609. break
  610. if self.BREAK: # 如果有break操作,退出循环
  611. self.BREAK = False
  612. finished = True
  613. break
  614. finished = True
  615. self.Log("Click: ", node["parameters"]["xpath"])
  616. self.recordLog("Click:" + node["parameters"]["xpath"])
  617. except NoSuchElementException:
  618. # except:
  619. print("Single loop element not found: ",
  620. node["parameters"]["xpath"])
  621. print("找不到要循环的单个元素: ", node["parameters"]["xpath"])
  622. self.recordLog(
  623. "Single loop element not found: " + node["parameters"]["xpath"])
  624. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  625. if node["option"] != 2:
  626. self.executeNode(
  627. i, None, node["parameters"]["xpath"], 0)
  628. finished = True
  629. break # 如果找不到元素,退出循环
  630. finally:
  631. if not finished:
  632. print("\n\n-------Retrying-------\n\n")
  633. self.Log("-------Retrying-------: ",
  634. node["parameters"]["xpath"])
  635. self.recordLog("ClickNotFound:" +
  636. node["parameters"]["xpath"])
  637. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  638. if node["option"] != 2:
  639. self.executeNode(
  640. i, None, node["parameters"]["xpath"], 0)
  641. break # 如果找不到元素,退出循环
  642. count = count + 1
  643. self.Log("Page: ", count)
  644. self.recordLog("Page:" + str(count))
  645. # print(node["parameters"]["exitCount"], "-------")
  646. if node["parameters"]["exitCount"] == count: # 如果达到设置的退出循环条件的话
  647. break
  648. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  649. output = self.execute_code(int(
  650. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  651. code = get_output_code(output)
  652. if code <= 0:
  653. break
  654. elif int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
  655. try:
  656. elements = self.browser.find_elements(By.XPATH,
  657. node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  658. if len(elements) == 0:
  659. print("Loop element not found: ",
  660. node["parameters"]["xpath"])
  661. print("找不到循环元素: ", node["parameters"]["xpath"])
  662. self.recordLog("pathNotFound: " +
  663. node["parameters"]["xpath"])
  664. for index in range(len(elements)):
  665. for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
  666. self.executeNode(i, elements[index],
  667. node["parameters"]["xpath"], index)
  668. if self.BREAK:
  669. break
  670. if self.BREAK:
  671. self.BREAK = False
  672. break
  673. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  674. while True: # 一直关闭窗口直到当前标签页
  675. self.browser.close() # 关闭使用完的标签页
  676. self.browser.switch_to.window(
  677. self.browser.window_handles[-1])
  678. if self.browser.current_window_handle == thisHandle:
  679. break
  680. if self.history["index"] != thisHistoryLength and self.history[
  681. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  682. difference = thisHistoryLength - \
  683. self.history["index"] # 计算历史记录变化差值
  684. self.browser.execute_script(
  685. 'history.go(' + str(difference) + ')') # 回退历史记录
  686. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  687. time.sleep(node["parameters"]["historyWait"])
  688. # else:
  689. # time.sleep(2)
  690. # 切换历史记录等待:
  691. self.Log("Change history back time or:",
  692. node["parameters"]["historyWait"])
  693. try:
  694. self.browser.execute_script('window.stop()')
  695. except:
  696. pass
  697. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  698. output = self.execute_code(int(
  699. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  700. code = get_output_code(output)
  701. if code <= 0:
  702. break
  703. except NoSuchElementException:
  704. print("Loop element not found: ", node["parameters"]["xpath"])
  705. print("找不到循环元素: ", node["parameters"]["xpath"])
  706. self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
  707. except Exception as e:
  708. raise
  709. elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
  710. # 千万不要忘了分割!!
  711. for path in node["parameters"]["pathList"].split("\n"):
  712. try:
  713. element = self.browser.find_element(
  714. By.XPATH, path, iframe=node["parameters"]["iframe"])
  715. for i in node["sequence"]: # 挨个执行操作
  716. self.executeNode(i, element, path, 0)
  717. if self.BREAK:
  718. break
  719. if self.BREAK:
  720. self.BREAK = False
  721. break
  722. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  723. while True: # 一直关闭窗口直到当前标签页
  724. self.browser.close() # 关闭使用完的标签页
  725. self.browser.switch_to.window(
  726. self.browser.window_handles[-1])
  727. if self.browser.current_window_handle == thisHandle:
  728. break
  729. if self.history["index"] != thisHistoryLength and self.history[
  730. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  731. difference = thisHistoryLength - \
  732. self.history["index"] # 计算历史记录变化差值
  733. self.browser.execute_script(
  734. 'history.go(' + str(difference) + ')') # 回退历史记录
  735. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  736. time.sleep(node["parameters"]["historyWait"])
  737. # else:
  738. # time.sleep(2)
  739. self.Log("Change history back time or:",
  740. node["parameters"]["historyWait"])
  741. try:
  742. self.browser.execute_script('window.stop()')
  743. except:
  744. pass
  745. except NoSuchElementException:
  746. print("Loop element not found: ", path)
  747. print("找不到循环元素: ", path)
  748. self.recordLog("pathNotFound: " + path)
  749. continue # 循环中找不到元素就略过操作
  750. except Exception as e:
  751. raise
  752. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  753. output = self.execute_code(int(
  754. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  755. code = get_output_code(output)
  756. if code <= 0:
  757. break
  758. elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
  759. textList = node["parameters"]["textList"].split("\n")
  760. for text in textList:
  761. self.recordLog("input: " + text)
  762. for i in node["sequence"]: # 挨个执行操作
  763. self.executeNode(i, text, "", 0)
  764. if self.BREAK:
  765. break
  766. if self.BREAK:
  767. self.BREAK = False
  768. break
  769. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  770. output = self.execute_code(int(
  771. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  772. code = get_output_code(output)
  773. if code <= 0:
  774. break
  775. elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
  776. # tempList = node["parameters"]["textList"].split("\r\n")
  777. urlList = list(
  778. filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
  779. # urlList = []
  780. # for url in tempList:
  781. # if url != "":
  782. # urlList.append(url)
  783. for url in urlList:
  784. self.recordLog("input: " + url)
  785. for i in node["sequence"]:
  786. self.executeNode(i, url, "", 0)
  787. if self.BREAK:
  788. break
  789. if self.BREAK:
  790. self.BREAK = False
  791. break
  792. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  793. output = self.execute_code(int(
  794. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  795. code = get_output_code(output)
  796. if code <= 0:
  797. break
  798. elif int(node["parameters"]["loopType"]) <= 6: # 命令返回值
  799. while True: # do while循环
  800. if int(node["parameters"]["loopType"]) == 5: # JS
  801. output = self.execute_code(
  802. 0, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  803. elif int(node["parameters"]["loopType"]) == 6: # System
  804. output = self.execute_code(
  805. 1, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  806. code = get_output_code(output)
  807. if code <= 0:
  808. break
  809. for i in node["sequence"]: # 挨个执行操作
  810. self.executeNode(i, code, node["parameters"]["xpath"], 0)
  811. if self.BREAK:
  812. break
  813. if self.BREAK:
  814. self.BREAK = False
  815. break
  816. self.history["index"] = thisHistoryLength
  817. self.history["handle"] = self.browser.current_window_handle
  818. self.scrollDown(node["parameters"])
  819. # 打开网页事件
  820. def openPage(self, para, loopValue):
  821. time.sleep(1) # 打开网页后强行等待至少1秒
  822. if len(self.browser.window_handles) > 1:
  823. self.browser.switch_to.window(
  824. self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
  825. self.browser.close()
  826. self.browser.switch_to.window(
  827. self.browser.window_handles[0]) # 打开网页操作从第1个页面开始
  828. self.history["handle"] = self.browser.current_window_handle
  829. if para["useLoop"]:
  830. url = loopValue
  831. elif para["url"] != "about:blank":
  832. url = self.links[self.urlId]
  833. # clear output parameters
  834. for key in self.outputParameters:
  835. self.outputParameters[key] = ""
  836. else:
  837. url = list(filter(isnull, para["links"].split("\n")))[0]
  838. # 将value中的Field[""]替换为outputParameters中的键值
  839. pattern = r'Field\["([^"]+)"\]'
  840. try:
  841. replaced_text = re.sub(
  842. pattern, lambda match: self.outputParameters.get(match.group(1), ''), url)
  843. except:
  844. replaced_text = url
  845. url = replaced_text
  846. try:
  847. maxWaitTime = int(para["maxWaitTime"])
  848. except:
  849. maxWaitTime = 10 # 默认最大等待时间为10秒
  850. try:
  851. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  852. self.browser.set_script_timeout(maxWaitTime)
  853. self.browser.get(url)
  854. if para["cookies"] != "":
  855. self.browser.delete_all_cookies() # 清除所有已有cookie
  856. cookies = para["cookies"].split('\n')
  857. for cookie in cookies:
  858. name, value = cookie.split('=', 1)
  859. cookie_dict = {'name': name, 'value': value}
  860. # 加载 cookie
  861. self.browser.add_cookie(cookie_dict)
  862. self.Log('Loading page: ' + url)
  863. self.recordLog('Loading page: ' + url)
  864. except TimeoutException:
  865. self.Log('Time out after set seconds when loading page: ' + url)
  866. self.recordLog(
  867. 'Time out after set seconds when loading page: ' + url)
  868. try:
  869. self.browser.execute_script('window.stop()')
  870. except:
  871. pass
  872. except Exception as e:
  873. print("Failed to load page: " + url)
  874. self.recordLog('Failed to load page: ' + url)
  875. try:
  876. self.history["index"] = self.browser.execute_script(
  877. "return history.length")
  878. except TimeoutException:
  879. try:
  880. self.browser.execute_script('window.stop()')
  881. self.history["index"] = self.browser.execute_script(
  882. "return history.length")
  883. except:
  884. self.history["index"] = 0
  885. self.scrollDown(para) # 控制屏幕向下滚动
  886. # 键盘输入事件
  887. def inputInfo(self, para, loopValue):
  888. time.sleep(0.1) # 输入之前等待0.1秒
  889. self.Log("Wait 0.1 second before input")
  890. try:
  891. textbox = self.browser.find_element(
  892. By.XPATH, para["xpath"], iframe=para["iframe"])
  893. # textbox.send_keys(Keys.CONTROL, 'a')
  894. # textbox.send_keys(Keys.BACKSPACE)
  895. self.execute_code(
  896. 2, para["beforeJS"], para["beforeJSWaitTime"], textbox, iframe=para["iframe"]) # 执行前置JS
  897. # Send the HOME key
  898. textbox.send_keys(Keys.HOME)
  899. # Send the SHIFT + END key combination
  900. textbox.send_keys(Keys.SHIFT, Keys.END)
  901. # Send the DELETE key
  902. textbox.send_keys(Keys.DELETE)
  903. value = ""
  904. if para["useLoop"]:
  905. value = loopValue
  906. else:
  907. value = para["value"]
  908. # 将value中的Field[""]替换为outputParameters中的键值
  909. pattern = r'Field\["([^"]+)"\]'
  910. try:
  911. replaced_text = re.sub(
  912. pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
  913. replaced_text = re.sub(
  914. '<enter>', '', replaced_text, flags=re.IGNORECASE)
  915. except:
  916. replaced_text = value
  917. textbox.send_keys(replaced_text)
  918. if value.lower().find("<enter>") >= 0:
  919. textbox.send_keys(Keys.ENTER)
  920. self.execute_code(
  921. 2, para["afterJS"], para["afterJSWaitTime"], textbox, iframe=para["iframe"]) # 执行后置js
  922. except:
  923. print("Cannot find input box element:" +
  924. para["xpath"] + ", please try to set the wait time before executing this operation")
  925. print("找不到输入框元素:" + para["xpath"] + ",请尝试在执行此操作前设置等待时间")
  926. self.recordLog("Cannot find input box element:" +
  927. para["xpath"] + "Please try to set the wait time before executing this operation")
  928. # 点击元素事件
  929. def clickElement(self, para, loopElement=None, clickPath="", index=0):
  930. try:
  931. maxWaitTime = int(para["maxWaitTime"])
  932. except:
  933. maxWaitTime = 10
  934. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  935. self.browser.set_script_timeout(maxWaitTime)
  936. # 点击前对该元素执行一段JavaScript代码
  937. try:
  938. # element = self.browser.find_element(
  939. # By.XPATH, path, iframe=para["iframe"])
  940. if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
  941. path = clickPath
  942. # element = loopElement
  943. else:
  944. index = 0
  945. path = para["xpath"] # 不然使用元素定义的xpath
  946. # element = self.browser.find_element(
  947. # By.XPATH, path, iframe=para["iframe"])
  948. elements = self.browser.find_elements(
  949. By.XPATH, path, iframe=para["iframe"])
  950. element = elements[index]
  951. if para["beforeJS"] != "":
  952. self.execute_code(2, para["beforeJS"],
  953. para["beforeJSWaitTime"], element, iframe=para["iframe"])
  954. except:
  955. print("Cannot find element:" +
  956. path + ", please try to set the wait time before executing this operation")
  957. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  958. self.recordLog("Cannot find element:" +
  959. path + ", please try to set the wait time before executing this operation")
  960. tempHandleNum = len(self.browser.window_handles) # 记录之前的窗口位置
  961. try:
  962. click_way = int(para["clickWay"])
  963. except:
  964. click_way = 0
  965. try:
  966. if click_way == 0: # 用selenium的点击方法
  967. actions = ActionChains(self.browser) # 实例化一个action对象
  968. actions.click(element).perform()
  969. elif click_way == 1: # 用js的点击方法
  970. script = 'var result = document.evaluate(`' + path + \
  971. '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
  972. self.browser.execute_script(script, str(index)) # 用js的点击方法
  973. except TimeoutException:
  974. self.Log('Time out after set seconds when loading clicked page')
  975. self.recordLog(
  976. 'Time out after set seconds when loading clicked page')
  977. try:
  978. self.browser.execute_script('window.stop()')
  979. except:
  980. pass
  981. except Exception as e:
  982. self.Log(e)
  983. self.recordLog(str(e))
  984. # 点击后对该元素执行一段JavaScript代码
  985. try:
  986. if para["afterJS"] != "":
  987. element = self.browser.find_element(
  988. By.XPATH, path, iframe=para["iframe"])
  989. self.execute_code(2, para["afterJS"],
  990. para["afterJSWaitTime"], element, iframe=para["iframe"])
  991. except:
  992. print("Cannot find element:" + path)
  993. self.recordLog("Cannot find element:" +
  994. path + ", please try to set the wait time before executing this operation")
  995. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  996. waitTime = float(para["wait"]) + 0.01 # 点击之后等待
  997. try:
  998. waitType = int(para["waitType"])
  999. except:
  1000. waitType = 0
  1001. if waitType == 0: # 固定等待时间
  1002. time.sleep(waitTime)
  1003. elif waitType == 1: # 随机等待时间
  1004. time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
  1005. if tempHandleNum != len(self.browser.window_handles): # 如果有新标签页的行为发生
  1006. self.browser.switch_to.window(
  1007. self.browser.window_handles[-1]) # 跳转到新的标签页
  1008. self.history["handle"] = self.browser.current_window_handle
  1009. try:
  1010. self.history["index"] = self.browser.execute_script(
  1011. "return history.length")
  1012. except TimeoutException:
  1013. try:
  1014. self.browser.execute_script('window.stop()')
  1015. except:
  1016. pass
  1017. self.history["index"] = self.browser.execute_script(
  1018. "return history.length")
  1019. else:
  1020. try:
  1021. self.history["index"] = self.browser.execute_script(
  1022. "return history.length")
  1023. except TimeoutException:
  1024. try:
  1025. self.browser.execute_script('window.stop()')
  1026. except:
  1027. pass
  1028. self.history["index"] = self.browser.execute_script(
  1029. "return history.length")
  1030. # 如果打开了新窗口,切换到新窗口
  1031. self.scrollDown(para) # 根据参数配置向下滚动
  1032. # rt.end()
  1033. def get_content(self, p, element):
  1034. content = ""
  1035. if p["contentType"] == 0:
  1036. # 先处理特殊节点类型
  1037. if p["nodeType"] == 2:
  1038. if element.get_attribute("href") != None:
  1039. content = element.get_attribute("href")
  1040. else:
  1041. content = ""
  1042. elif p["nodeType"] == 3:
  1043. if element.get_attribute("value") != None:
  1044. content = element.get_attribute("value")
  1045. else:
  1046. content = ""
  1047. elif p["nodeType"] == 4: # 图片
  1048. if element.get_attribute("src") != None:
  1049. content = element.get_attribute("src")
  1050. else:
  1051. content = ""
  1052. try:
  1053. downloadPic = p["downloadPic"]
  1054. except:
  1055. downloadPic = 0
  1056. if downloadPic == 1:
  1057. download_image(content, "Data/Task_" +
  1058. str(self.id) + "/" + self.saveName + "/")
  1059. else: # 普通节点
  1060. content = element.text
  1061. elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
  1062. if p["nodeType"] == 2:
  1063. if element.get_attribute("href") != None:
  1064. content = element.get_attribute("href")
  1065. else:
  1066. content = ""
  1067. elif p["nodeType"] == 3:
  1068. if element.get_attribute("value") != None:
  1069. content = element.get_attribute("value")
  1070. else:
  1071. content = ""
  1072. elif p["nodeType"] == 4: # 图片
  1073. if element.get_attribute("src") != None:
  1074. content = element.get_attribute("src")
  1075. else:
  1076. content = ""
  1077. try:
  1078. downloadPic = p["downloadPic"]
  1079. except:
  1080. downloadPic = 0
  1081. if downloadPic == 1:
  1082. download_image(content, "Data/Task_" +
  1083. str(self.id) + "/" + self.saveName + "/")
  1084. else:
  1085. command = 'var arr = [];\
  1086. var content = arguments[0];\
  1087. for(var i = 0, len = content.childNodes.length; i < len; i++) {\
  1088. if(content.childNodes[i].nodeType === 3){ \
  1089. arr.push(content.childNodes[i].nodeValue);\
  1090. }\
  1091. }\
  1092. var str = arr.join(" "); \
  1093. return str;'
  1094. content = self.browser.execute_script(command, element).replace(
  1095. "\n", "").replace("\\s+", " ")
  1096. elif p["contentType"] == 2:
  1097. content = element.get_attribute('innerHTML')
  1098. elif p["contentType"] == 3:
  1099. content = element.get_attribute('outerHTML')
  1100. elif p["contentType"] == 4:
  1101. # 获取元素的背景图片地址
  1102. bg_url = element.value_of_css_property('background-image')
  1103. # 清除背景图片地址中的多余字符
  1104. bg_url = bg_url.replace('url("', '').replace('")', '')
  1105. content = bg_url
  1106. elif p["contentType"] == 5:
  1107. content = self.browser.current_url
  1108. elif p["contentType"] == 6:
  1109. content = self.browser.title
  1110. elif p["contentType"] == 7:
  1111. # 获取整个网页的高度和宽度
  1112. height = self.browser.execute_script(
  1113. "return document.body.scrollHeight")
  1114. width = self.browser.execute_script(
  1115. "return document.body.scrollWidth")
  1116. # 调整浏览器窗口的大小
  1117. self.browser.set_window_size(width, height)
  1118. element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
  1119. "/" + str(time.time()) + ".png")
  1120. elif p["contentType"] == 8:
  1121. try:
  1122. screenshot = element.screenshot_as_png
  1123. screenshot_stream = io.BytesIO(screenshot)
  1124. # 使用Pillow库打开截图,并转换为灰度图像
  1125. image = Image.open(screenshot_stream).convert('L')
  1126. # 使用Tesseract OCR引擎识别图像中的文本
  1127. text = pytesseract.image_to_string(image, lang='chi_sim+eng')
  1128. content = text
  1129. except Exception as e:
  1130. content = "OCR Error"
  1131. print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
  1132. if sys.platform == "win32":
  1133. print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  1134. elif sys.platform == "darwin":
  1135. print(e)
  1136. print(
  1137. "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
  1138. elif sys.platform == "linux":
  1139. print(e)
  1140. print(
  1141. "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
  1142. else:
  1143. print(e)
  1144. print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  1145. elif p["contentType"] == 9:
  1146. content = self.execute_code(
  1147. 2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
  1148. elif p["contentType"] == 12: # 系统命令返回值
  1149. content = self.execute_code(1, p["JS"], p["JSWaitTime"])
  1150. elif p["contentType"] == 10: # 下拉框选中的值
  1151. try:
  1152. select_element = Select(element)
  1153. content = select_element.first_selected_option.get_attribute(
  1154. "value")
  1155. except:
  1156. content = ""
  1157. elif p["contentType"] == 11: # 下拉框选中的文本
  1158. try:
  1159. select_element = Select(element)
  1160. content = select_element.first_selected_option.text
  1161. except:
  1162. content = ""
  1163. return content
  1164. # 提取数据事件
  1165. def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
  1166. try:
  1167. pageHTML = etree.HTML(self.browser.page_source)
  1168. except:
  1169. pageHTML = etree.HTML("")
  1170. if loopElement != "": # 只在数据在循环中提取时才需要获取循环元素
  1171. try:
  1172. loopElementOuterHTML = loopElement.get_attribute('outerHTML')
  1173. except:
  1174. try: # 循环点击每个链接如果没有新标签页打开,loopElement会丢失,此时需要重新获取
  1175. elements = self.browser.find_elements(
  1176. By.XPATH, parentPath, iframe=para["paras"][0]["iframe"])
  1177. loopElement = elements[index]
  1178. loopElementOuterHTML = loopElement.get_attribute(
  1179. 'outerHTML')
  1180. except:
  1181. loopElementOuterHTML = ""
  1182. else:
  1183. loopElementOuterHTML = ""
  1184. loopElementHTML = etree.HTML(loopElementOuterHTML)
  1185. for p in para["paras"]:
  1186. if p["optimizable"]:
  1187. try:
  1188. # 只有当前环境不变变化才可以快速提取数据
  1189. if self.browser.iframe_env != p["iframe"]:
  1190. p["optimizable"] = False
  1191. continue
  1192. # p["relativeXPath"] = p["relativeXPath"].lower()
  1193. # p["relativeXPath"] = lowercase_tags_in_xpath(p["relativeXPath"])
  1194. # 已经有text()或@href了,不需要再加
  1195. if p["relativeXPath"].find("/@href") >= 0 or p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
  1196. xpath = p["relativeXPath"]
  1197. elif p["nodeType"] == 2:
  1198. xpath = p["relativeXPath"] + "/@href"
  1199. elif p["contentType"] == 1:
  1200. xpath = p["relativeXPath"] + "/text()"
  1201. elif p["contentType"] == 0:
  1202. xpath = p["relativeXPath"] + "//text()"
  1203. if p["relative"]:
  1204. # if p["relativeXPath"] == "":
  1205. # content = [loopElementHTML]
  1206. # else:
  1207. # 如果字串里有//即子孙查找,则不动语句
  1208. if p["relativeXPath"].find("//") >= 0:
  1209. full_path = "(" + parentPath + \
  1210. xpath + ")" + \
  1211. "[" + str(index + 1) + "]"
  1212. content = pageHTML.xpath(full_path)
  1213. else:
  1214. content = loopElementHTML.xpath(
  1215. "/html/body/" + loopElementHTML[0][0].tag + xpath)
  1216. else:
  1217. if xpath.find("/body") < 0:
  1218. xpath = "/html/body" + xpath
  1219. content = pageHTML.xpath(xpath)
  1220. if len(content) > 0:
  1221. # html = etree.tostring(content[0], encoding='utf-8').decode('utf-8')
  1222. # 拼接所有文本内容并去掉两边的空白
  1223. content = ' '.join(result.strip()
  1224. for result in content if result.strip())
  1225. if p["nodeType"] == 2:
  1226. base_url = self.browser.current_url
  1227. content = urljoin(base_url, content) # 合并链接相对路径为绝对路径
  1228. else:
  1229. content = p["default"]
  1230. if not self.dataNotFoundKeys[p["name"]]:
  1231. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1232. p["relativeXPath"], p["name"]))
  1233. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
  1234. p["name"], p["relativeXPath"]))
  1235. self.dataNotFoundKeys[p["name"]] = True
  1236. self.recordLog(
  1237. 'Element %s not found, use default' % p["relativeXPath"])
  1238. except Exception as e:
  1239. if not self.dataNotFoundKeys[p["name"]]:
  1240. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1241. p["relativeXPath"], p["name"]))
  1242. print("提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % (
  1243. p["name"], p["relativeXPath"]))
  1244. self.dataNotFoundKeys[p["name"]] = True
  1245. self.recordLog(
  1246. 'Element %s not found, use default' % p["relativeXPath"])
  1247. self.outputParameters[p["name"]] = content
  1248. # 对于不能优化的操作,使用selenium执行
  1249. for p in para["paras"]:
  1250. if not p["optimizable"]:
  1251. content = ""
  1252. if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL,去找元素
  1253. try:
  1254. # p["relativeXPath"] = p["relativeXPath"].lower()
  1255. # p["relativeXPath"] = lowercase_tags_in_xpath(p["relativeXPath"])
  1256. if p["relative"]: # 是否相对xpath
  1257. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1258. element = loopElement
  1259. else:
  1260. # 如果字串里有//即子孙查找,则不动语句
  1261. if p["relativeXPath"].find("//") >= 0:
  1262. full_path = "(" + parentPath + \
  1263. p["relativeXPath"] + ")" + \
  1264. "[" + str(index + 1) + "]"
  1265. element = self.browser.find_element(
  1266. By.XPATH, full_path, iframe=p["iframe"])
  1267. else:
  1268. element = loopElement.find_element(By.XPATH,
  1269. p["relativeXPath"][1:])
  1270. else:
  1271. element = self.browser.find_element(
  1272. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1273. except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
  1274. # print(p)
  1275. try:
  1276. content = p["default"]
  1277. except Exception as e:
  1278. content = ""
  1279. self.outputParameters[p["name"]] = content
  1280. try:
  1281. if not self.dataNotFoundKeys[p["name"]]:
  1282. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1283. p["relativeXPath"], p["name"]))
  1284. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
  1285. p["name"], p["relativeXPath"]))
  1286. self.dataNotFoundKeys[p["name"]] = True
  1287. self.recordLog(
  1288. 'Element %s not found, use default' % p["relativeXPath"])
  1289. except:
  1290. pass
  1291. continue
  1292. except TimeoutException: # 超时的时候设置超时值
  1293. self.Log('Time out after set seconds when getting data')
  1294. self.recordLog(
  1295. 'Time out after set seconds when getting data')
  1296. try:
  1297. self.browser.execute_script('window.stop()')
  1298. except:
  1299. pass
  1300. if p["relative"]: # 是否相对xpath
  1301. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1302. element = loopElement
  1303. else:
  1304. element = loopElement.find_element(By.XPATH,
  1305. p["relativeXPath"][1:])
  1306. else:
  1307. element = self.browser.find_element(
  1308. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1309. # rt.end()
  1310. else:
  1311. element = self.browser.find_element(
  1312. By.XPATH, "//body", iframe=p["iframe"])
  1313. try:
  1314. self.execute_code(
  1315. 2, p["beforeJS"], p["beforeJSWaitTime"], element, iframe=p["iframe"]) # 执行前置js
  1316. content = self.get_content(p, element)
  1317. except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
  1318. self.recordLog(
  1319. 'StaleElementReferenceException: '+p["relativeXPath"])
  1320. time.sleep(3)
  1321. try:
  1322. if p["relative"]: # 是否相对xpath
  1323. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1324. element = loopElement
  1325. self.recordLog(
  1326. 'StaleElementReferenceException: loopElement')
  1327. else:
  1328. element = loopElement.find_element(By.XPATH,
  1329. p["relativeXPath"][1:])
  1330. self.recordLog(
  1331. 'StaleElementReferenceException: loopElement+relativeXPath')
  1332. else:
  1333. element = self.browser.find_element(
  1334. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1335. self.recordLog(
  1336. 'StaleElementReferenceException: relativeXPath')
  1337. content = self.get_content(p, element)
  1338. except StaleElementReferenceException:
  1339. self.recordLog(
  1340. 'StaleElementReferenceException: '+p["relativeXPath"])
  1341. continue # 再出现类似问题直接跳过
  1342. self.outputParameters[p["name"]] = content
  1343. self.execute_code(
  1344. 2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS
  1345. line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
  1346. self.OUTPUT.append(line)
  1347. # rt.end()
  1348. if __name__ == '__main__':
  1349. # from multiprocessing import freeze_support
  1350. # freeze_support() # 防止无限死循环多开
  1351. config = {
  1352. "id": [0],
  1353. "saved_file_name": "",
  1354. "user_data": False,
  1355. "config_folder": "",
  1356. "config_file_name": "config.json",
  1357. "read_type": "remote",
  1358. "headless": False,
  1359. "server_address": "http://localhost:8074",
  1360. "version": "0.3.5",
  1361. }
  1362. c = Config(config)
  1363. print(c)
  1364. options = Options()
  1365. driver_path = "chromedriver.exe"
  1366. import platform
  1367. print(sys.platform, platform.architecture())
  1368. option = webdriver.ChromeOptions()
  1369. if not os.path.exists(os.getcwd()+"/Data"):
  1370. os.mkdir(os.getcwd()+"/Data")
  1371. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  1372. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1373. # MacOS需要用option而不是options!
  1374. option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1375. option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1376. options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1377. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  1378. # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1379. # # MacOS需要用option而不是options!
  1380. # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1381. # driver_path = os.getcwd()+ "/chromedriver_mac64"
  1382. print(driver_path)
  1383. if c.config_folder == "":
  1384. c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
  1385. # print("Config folder for MacOS:", c.config_folder)
  1386. elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
  1387. print("Finding chromedriver in EasySpider",
  1388. os.getcwd()+"/EasySpider")
  1389. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  1390. options.binary_location = os.path.join(
  1391. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1392. driver_path = os.path.join(
  1393. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  1394. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1395. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1396. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  1397. options.binary_location = os.path.join(
  1398. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1399. driver_path = os.path.join(
  1400. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  1401. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1402. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1403. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  1404. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1405. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  1406. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1407. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1408. else:
  1409. print("Unsupported platform")
  1410. sys.exit()
  1411. print("Chrome location:", options.binary_location)
  1412. print("Chromedriver location:", driver_path)
  1413. # elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
  1414. # print("Finding chromedriver in ./Chrome",
  1415. # os.getcwd()+"/Chrome")
  1416. # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
  1417. # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
  1418. # driver_path = "./Chrome/chromedriver.exe"
  1419. elif os.path.exists(os.getcwd()+"/../ElectronJS"):
  1420. # 软件dev用
  1421. print("Finding chromedriver in EasySpider",
  1422. os.getcwd()+"/ElectronJS")
  1423. option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1424. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  1425. option.add_extension("../ElectronJS/XPathHelper.crx")
  1426. else:
  1427. options.binary_location = "./chrome.exe" # 指定chrome位置
  1428. driver_path = "./chromedriver.exe"
  1429. option.add_extension("XPathHelper.crx")
  1430. option.add_experimental_option(
  1431. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  1432. options.add_argument('-ignore-certificate-errors')
  1433. options.add_argument('-ignore -ssl-errors')
  1434. option.add_argument('-ignore-certificate-errors')
  1435. option.add_argument('-ignore -ssl-errors')
  # Notes on reusing a Chrome user profile:
  # 0. Carrying cookies requires a user-data-dir.
  # 1. chrome_options is the right place to configure the profile folder and
  #    the Chrome binary location.
  # 2. The profile folder is C:/Users/<user>/AppData/Local/Google/Chrome/User Data
  #    -- do NOT append "Default" to it.
  # 3. Even with the same profile, cookies stored by a different Chrome version
  #    differ and cannot be reused for crawling.
  # 4. If TMALL keeps showing a captcha that cannot be passed, it has to be
  #    handled in another browser (the original note ends here).
  #
  # Read the JSON config file and pick up the absolute user-data folder.
  # Loading stays best-effort, but the failure reason is now printed instead of
  # being silently discarded, and KeyboardInterrupt/SystemExit are no longer
  # swallowed.
  config_path = c.config_folder + c.config_file_name
  try:
      with open(config_path, "r", encoding='utf-8') as f:
          config = json.load(f)
      print("Config file path: " + config_path)
      absolute_user_data_folder = config["absolute_user_data_folder"]
      print("\nAbsolute_user_data_folder:",
            absolute_user_data_folder, "\n")
  except Exception as e:
      print("Failed to load config file:", e)
  if c.user_data:
      # NOTE(review): if the load above failed, `absolute_user_data_folder` is
      # only bound here when it was defined earlier in the file -- TODO confirm.
      for _opts in (option, options):
          _opts.add_argument(
              f'--user-data-dir={absolute_user_data_folder}')  # TMALL anti-scraping workaround
          _opts.add_argument("--profile-directory=Default")
  # Headless mode is opt-in; when requested, both option objects get the switch.
  if c.headless:
      print("Headless mode")
      print("无头模式")
      for _opts in (option, options):
          _opts.add_argument("--headless")
  # Always disable the Blink "AutomationControlled" feature on both option
  # objects (TMALL anti-scraping workaround). A hard-coded --user-data-dir
  # variant of this workaround was left disabled in the original.
  for _opts in (option, options):
      _opts.add_argument("--disable-blink-features=AutomationControlled")
  # Start one BrowserThread per task id in c.id.
  # For each id: load the task description ("service"), configure the shared
  # option objects, create the matching browser and start the worker thread.
  # NOTE(review): `option`/`options` are shared by all iterations, so an
  # experimental option set for one task (e.g. mobileEmulation) is still set
  # when the next task's browser is created.
  # NOTE(review): a fresh Event is created per task, but the key listener that
  # follows this loop only receives the `event` left over from the last
  # iteration -- confirm whether pausing is meant to affect every task.
  threads = []
  for i in c.id:
      print("id: ", i)
      if c.read_type == "remote":
          print("remote")
          content = requests.get(
              c.server_address + "/queryExecutionInstance?id=" + str(i))
          service = json.loads(content.text)  # load the task description
      else:
          print("local")
          with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f:
              service = json.load(f)  # load the task description
      print("Task Name:", service["name"])
      print("任务名称:", service["name"])
      # A missing "cloudflare" key means the normal (non-Cloudflare) mode.
      cloudflare = service.get("cloudflare", 0)
      if cloudflare == 0:
          prefs = {
              # download directory for this task
              "download.default_directory": "Data/Task_" + str(i),
              "download.prompt_for_download": False,  # no download prompt
              "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
              "download.directory_upgrade": True,
              "download.extensions_to_open": "applications/pdf",
              "plugins.always_open_pdf_externally": True,  # always open PDFs externally
          }
          # Each option object gets its own top-level dict.
          options.add_experimental_option("prefs", dict(prefs))
          option.add_experimental_option("prefs", dict(prefs))
          if service.get("environment") == 1:
              # emulate browsing on an iPhone X
              for _opts in (option, options):
                  _opts.add_experimental_option(
                      'mobileEmulation', {'deviceName': 'iPhone X'})
          browser_t = MyChrome(
              options=options, chrome_options=option, executable_path=driver_path)
      elif cloudflare == 1:
          if sys.platform == "darwin":
              print("Not support Cloudflare Mode on MacOS")
              print("MacOS不支持Cloudflare验证模式")
              sys.exit()
          browser_t = MyUCChrome(
              options=options, chrome_options=option, driver_executable_path=driver_path)
          print("Pass Cloudflare Mode")
          print("过Cloudflare验证模式")
      else:
          # Previously this fell through with `browser_t` unbound (NameError on
          # the first task) or still pointing at the previous task's browser.
          print("Unsupported cloudflare value:", cloudflare,
                "- task", i, "is skipped")
          continue
      event = Event()
      event.set()
      thread = BrowserThread(browser_t, i, service,
                             c.version, event, c.saved_file_name, config=config)
      print("Thread with task id: ", i, " is created")
      threads.append(thread)
      thread.start()
  # Pause support: wait briefly, tell the user about the 'p' key, then block on
  # a keyboard listener whose callbacks receive `press_time` and `event`.
  # (The original also kept a disabled variant that, on non-Linux platforms,
  # started Thread(target=check_pause, args=("p", event)) instead.)
  time.sleep(3)
  press_time = {"duration": 0, "is_pressed": False}
  print("\n\n----------------------------------")
  print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
  print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
  print("----------------------------------\n\n")
  # Listen for keyboard input. Failure stays non-fatal, but it is reported, and
  # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
  try:
      with Listener(on_press=on_press_creator(press_time, event),
                    on_release=on_release_creator(event, press_time)) as listener:
          listener.join()
  except Exception as e:
      print("暂停功能不可用:", e)
      print("The pause function is unavailable:", e)
  # Wait for every task thread to finish (one blank line is printed per thread).
  for worker in threads:
      print()
      worker.join()
  # Only after all threads are done, close each thread's browser.
  for worker in threads:
      worker.browser.quit()
  print("程序已运行完成,请手动关闭此窗口。")
  print("The program has finished running, please manually close this window.")