easyspider_executestage.py 89 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720
  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. from datetime import datetime
  4. import io # 遇到错误退出时应执行的代码
  5. import json
  6. # from lib2to3.pgen2 import driver
  7. import re
  8. # import shutil
  9. import subprocess
  10. import sys
  11. # from urllib import parse
  12. # import base64
  13. # import hashlib
  14. import time
  15. import requests
  16. from urllib.parse import urljoin
  17. from lxml import etree
  18. # import undetected_chromedriver as uc
  19. from pynput.keyboard import Key, Listener
  20. from selenium.webdriver.chrome.options import Options
  21. from selenium.webdriver.common.keys import Keys
  22. from selenium.webdriver.common.action_chains import ActionChains
  23. from selenium import webdriver
  24. from selenium.webdriver.support.ui import WebDriverWait
  25. from selenium.webdriver.support import expected_conditions as EC
  26. from selenium.webdriver.common.by import By
  27. from selenium.common.exceptions import NoSuchElementException
  28. from selenium.common.exceptions import TimeoutException
  29. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  30. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  31. from selenium.webdriver.support.ui import Select
  32. from selenium.webdriver import ActionChains
  33. from selenium.webdriver.common.by import By
  34. import random
  35. # import pandas as pd
  36. from openpyxl import load_workbook, Workbook
  37. # import numpy
  38. import csv
  39. import os
  40. from commandline_config import Config
  41. import pytesseract
  42. from PIL import Image
  43. # import uuid
  44. from threading import Thread, Event
  45. from myChrome import MyChrome
  46. if sys.platform != "darwin":
  47. from myChrome import MyUCChrome
  48. from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
  # Module-wide Chrome capabilities: page loads use the "none" strategy.
  # NOTE(review): this is the shared DesiredCapabilities.CHROME dict itself,
  # not a copy, so the change is visible to every other user of that dict --
  # presumably intentional; confirm before switching to ``.copy()``.
  desired_capabilities = DesiredCapabilities.CHROME
  desired_capabilities.update(pageLoadStrategy="none")
  51. class BrowserThread(Thread):
  def __init__(self, browser_t, id, service, version, event, saveName, config):
      """Set up one task run: output location, browser and output schema.

      Args:
          browser_t: Browser driver used for the whole task.
          id: Task identifier; output goes under ``Data/Task_<id>``.
          service: Task description, indexable by string keys. ``graph``,
              ``links``, ``containJudge`` and ``outputParameters`` are
              required; ``saveName``, ``maxViewLength``, ``outputFormat``
              and ``saveThreshold`` are optional; a missing ``version``
              aborts the run.
          version: Version string of the running program.
          event: Object whose ``wait()`` is called after each executed node.
          saveName: Save name from the command line; when not empty it
              replaces the one stored in ``service``.
          config: Run configuration; ``config["mysql_config_path"]`` is read
              when the output format is ``mysql``.

      Raises:
          SystemExit: After quitting the browser, when the task has no
              version field or was saved by an incompatible pre-0.3.1 build.

      NOTE(review): relies on a module-level ``driver_path`` defined outside
      this method to locate ``stealth.min.js``.
      """
      super().__init__()

      def optional(key, default):
          # Optional task settings fall back to a default when absent.
          try:
              return service[key]
          except KeyError:
              return default

      def version_numbers(text):
          # "0.10.2" -> (0, 10, 2), so versions compare numerically
          # instead of character by character.
          return tuple(int(part) for part in re.findall(r"\d+", str(text)))

      self.browser = browser_t
      self.config = config
      self.id = id
      self.event = event
      self.log = ""  # log text buffered until the next saveData()
      self.SAVED = False  # whether the data has been stored
      self.BREAK = False
      self.CONTINUE = False

      # Save name priority: command line > task file > current timestamp.
      now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
      self.saveName = saveName if saveName != "" else optional("saveName", now)
      self.saveName = self.saveName.replace("current_time", now)
      print("Save Name for task ID", self.id, "is:", self.saveName)
      print("任务ID", self.id, "的保存文件名为:", self.saveName)

      # Per-run folder (used for screenshots); parents are created as needed.
      task_dir = "Data/Task_" + str(self.id)
      save_dir = task_dir + "/" + self.saveName
      if not os.path.exists(save_dir):
          os.makedirs(save_dir, exist_ok=True)

      # Register the stealth script (anti-bot-detection) for every new
      # document before any page is opened.
      stealth_path = driver_path[:driver_path.find(
          "chromedriver")] + "stealth.min.js"
      with open(stealth_path, 'r') as f:
          js = f.read()
      print("Loading stealth.min.js")
      self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
          'source': js})
      self.browser.get('about:blank')

      self.procedure = service["graph"]  # execution flow of the task
      self.maxViewLength = optional("maxViewLength", 15)  # max display length
      self.outputFormat = optional("outputFormat", "csv")

      # Version gate: tasks from 0.3.1 onwards run on any later build; older
      # tasks need exactly the same version; v0.2.0 tasks have no version.
      try:
          task_version = service["version"]
      except KeyError:
          print("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider v0.2.0 to run this task!")
          self.browser.quit()
          sys.exit()
      if version_numbers(task_version) < (0, 3, 1) and task_version != version:
          print("版本不一致,请使用" +
                task_version + "版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider " +
                task_version + " to run this task!")
          self.browser.quit()
          sys.exit()

      self.save_threshold = optional("saveThreshold", 10)  # rows per flush
      # presumably ``isnull`` keeps the non-blank lines -- TODO confirm
      self.links = list(
          filter(isnull, service["links"].split("\n")))  # links to execute
      self.OUTPUT = []  # collected rows
      self.writeMode = 1  # 0 = new file, 1 = append, 2 = MySQL
      file_formats = ("csv", "txt", "xlsx")
      if self.outputFormat in file_formats:
          if not os.path.exists(save_dir + '.' + self.outputFormat):
              self.OUTPUT.append([])  # header row, filled in below
              self.writeMode = 0
      elif self.outputFormat == "mysql":
          self.mysql = myMySQL(config["mysql_config_path"])
          self.mysql.create_table(self.saveName, service["outputParameters"])
          self.writeMode = 2
      if self.writeMode == 1:
          print("追加模式")
          print("Append Mode")
      elif self.writeMode == 0:
          print("新建模式")
          print("New Mode")
      elif self.writeMode == 2:
          print("MySQL模式")
          print("MySQL Mode")

      self.containJudge = service["containJudge"]  # task contains a branch
      self.outputParameters = {}
      self.outputParametersTypes = []
      self.outputParametersRecord = []  # whether each field is recorded
      self.dataNotFoundKeys = {}  # keys for which no data was found
      self.history = {"index": 0, "handle": None}  # position in page history
      for para in service["outputParameters"]:
          name = para["name"]
          if name in self.outputParameters:
              continue
          self.outputParameters[name] = ""
          self.dataNotFoundKeys[name] = False
          self.outputParametersTypes.append(para.get("type", "text"))
          self.outputParametersRecord.append(
              bool(para.get("recordASField", True)))
          # A header is written only when a new file is being created.
          if self.outputFormat in file_formats and self.writeMode == 0:
              self.OUTPUT[0].append(name)
      self.urlId = 0  # index of the link currently being processed
      self.preprocess()  # fill defaults and mark optimizable extractions
  169. # 检测如果没有复杂的操作,优化提取数据流程
  def preprocess(self):
      """Normalize every node of ``self.procedure`` in place before running.

      Missing optional parameters receive defaults, XPath expressions are
      passed through ``lowercase_tags_in_xpath``, and for data-extraction
      nodes (option 3) the recorded fields are counted and every field is
      flagged with ``optimizable``.
      """
      for node in self.procedure:
          settings = node["parameters"]
          settings.setdefault("iframe", False)
          try:
              settings["xpath"] = lowercase_tags_in_xpath(settings["xpath"])
          except:
              # no xpath, or it could not be converted: leave it untouched
              pass

          operation = node["option"]
          if operation == 1:  # open page
              settings.setdefault("cookies", "")
          elif operation == 3:  # extract data
              settings["recordASField"] = 0
              fields = settings["paras"]
              settings.setdefault("clear", 0)
              for field in fields:
                  field.setdefault("iframe", False)
                  try:
                      field["relativeXPath"] = lowercase_tags_in_xpath(
                          field["relativeXPath"])
                  except:
                      pass
                  try:
                      settings["recordASField"] += field["recordASField"]
                  except:
                      # fields without a usable flag count as recorded
                      settings["recordASField"] += 1
                  if field["contentType"] == 8:
                      print("默认的OCR识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
                      print("If you think the default OCR function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
                  # Only script-free fields with contentType <= 1 and
                  # nodeType <= 2 are marked optimizable.
                  field["optimizable"] = (
                      field["beforeJS"] == ""
                      and field["afterJS"] == ""
                      and field["contentType"] <= 1
                      and field["nodeType"] <= 2
                  )
          elif operation == 4:  # input text
              settings.setdefault("index", 0)
          elif operation == 5:  # custom operation
              settings.setdefault("clear", 0)
  def run(self):
      """Thread body: run the whole flow once per link, then flush and close.

      After the last link the per-run folder is removed if it is still empty,
      the remaining log and data are written with ``saveData(exit=True)`` and,
      for MySQL output, the connection is closed.
      """
      total = len(self.links)
      for position in range(1, total + 1):
          print("正在执行第", position, "/ ", total, "个链接")
          print("Executing link", position, "/ ", total)
          self.executeNode(0)  # node 0 is the root of the flow
          self.urlId += 1

      run_dir = "Data/Task_" + str(self.id) + "/" + self.saveName
      if not os.listdir(run_dir):
          # nothing was stored in the folder, so do not leave it behind
          os.rmdir(run_dir)

      print("Done!")
      print("执行完成!")
      self.recordLog("Done!")
      self.saveData(exit=True)
      if self.outputFormat == "mysql":
          self.mysql.close()
  def recordLog(self, str=""):
      """Append ``str`` followed by a newline to the buffered log text."""
      self.log += str + "\n"
  242. # 控制台打印log函数
  def Log(self, text, text2=""):
      """Console debug output; prints only while the local flag is enabled."""
      enabled = False  # flip to True to see debug messages on the console
      if not enabled:
          return
      print(text, text2)
  247. # @atexit.register
  248. # def clean(self):
  249. # self.saveData(exit=True)
  250. # self.browser.quit()
  251. # sys.exit(0)
  def saveData(self, exit=False):
      """Flush buffered log text and collected rows to their destination.

      Nothing happens until ``self.OUTPUT`` holds at least
      ``self.save_threshold`` rows, unless ``exit`` is True. On a flush the
      log is appended to ``<saveName>_log.txt``, the rows go to the csv/txt
      file, the xlsx file or MySQL depending on ``self.outputFormat``, and
      both buffers are emptied.
      """
      flush_now = exit == True or len(self.OUTPUT) >= self.save_threshold
      if not flush_now:
          return

      base = "Data/Task_" + str(self.id) + "/" + self.saveName
      with open(base + '_log.txt', 'a', encoding='utf-8-sig') as log_file:
          log_file.write(self.log)

      fmt = self.outputFormat
      if fmt == "csv" or fmt == "txt":
          write_to_csv(base + '.' + fmt, self.OUTPUT,
                       self.outputParametersRecord)
      elif fmt == "xlsx":
          write_to_excel(base + '.xlsx', self.OUTPUT,
                         self.outputParametersTypes, self.outputParametersRecord)
      elif fmt == "mysql":
          self.mysql.write_to_mysql(
              self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)

      # start the next batch with empty buffers
      self.OUTPUT = []
      self.log = ""
  def scrollDown(self, para, rt=""):
      """Scroll the page as configured in ``para``.

      Args:
          para: Node parameters. ``scrollType`` selects the behaviour
              (0: none, 1: Page Down key, 2: End key, 3: End key until the
              body text stops changing), ``scrollCount`` is the number of
              key presses for types 1 and 2 and must be positive for any
              scrolling to happen, ``iframe`` is passed to the element
              lookup, and the optional ``scrollWaitTime`` is slept before
              scrolling and after every key press.
          rt: Optional object whose ``end()`` is called after the retry
              pass; the default ``""`` means there is none.

      If scrolling fails, page loading is stopped with ``window.stop()`` and
      the key presses are attempted once more; an error in that second
      attempt propagates to the caller.
      """
      def pause():
          # The wait time is optional: a missing or unusable value is skipped.
          try:
              time.sleep(para["scrollWaitTime"])
          except (KeyError, TypeError, ValueError, OverflowError):
              pass

      def find_body():
          return self.browser.find_element(
              By.CSS_SELECTOR, "body", iframe=para["iframe"])

      def press_scroll_key():
          # One pass of ``scrollCount`` key presses, pausing after each.
          for _ in range(para["scrollCount"]):
              self.Log("Wait for set second after screen scrolling")
              body = find_body()
              if scrollType == 1:
                  body.send_keys(Keys.PAGE_DOWN)
              elif scrollType == 2:
                  body.send_keys(Keys.END)
              pause()

      pause()  # wait before scrolling
      scrollType = int(para["scrollType"])
      try:
          if scrollType != 0 and para["scrollCount"] > 0:
              if scrollType == 1 or scrollType == 2:
                  press_scroll_key()
              elif scrollType == 3:
                  seen_text = ""
                  rounds = 0
                  while True:
                      current_text = find_body().text
                      if current_text == seen_text:
                          print("页面已检测不到新内容,停止滚动。")
                          print("No new content detected on the page, stop scrolling.")
                          break
                      seen_text = current_text
                      find_body().send_keys(Keys.END)
                      rounds += 1
                      print("滚动到底部,第", rounds, "次。")
                      print("Scroll to the bottom, the", rounds, "time.")
                      pause()
      except Exception:
          self.Log('Time out after set seconds when scrolling. ')
          self.recordLog('Time out after set seconds when scrolling')
          try:
              # stop a page that is still loading before trying again
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
          if scrollType != 0 and para["scrollCount"] > 0:
              press_scroll_key()
          if rt != "":
              rt.end()
  def execute_code(self, codeMode, code, max_wait_time, element=None, iframe=False):
      """Run a user-supplied snippet and return its result as a string.

      Args:
          codeMode: 0 runs ``code`` as JavaScript in the page, 2 runs it as
              JavaScript with ``element`` as argument, 1 runs it as a shell
              command. Other values run nothing.
          code: Snippet text. Every ``Field["name"]`` is replaced by the
              current value of that output parameter (empty if unknown).
          max_wait_time: Script/command timeout; 0 is replaced by 999999.
          element: Element handed to the script in mode 2.
          iframe: Whether the snippet should run inside an iframe; the
              browser is switched into the first iframe that accepts the
              switch, or back to the top document, when its current
              ``iframe_env`` state differs.

      Returns:
          ``""`` for empty ``code``; otherwise ``str()`` of the script result
          or of the command's standard output (``""`` after a failure).
      """
      if code == "":
          return ""
      result = ""
      timeout = 999999 if max_wait_time == 0 else max_wait_time

      def field_value(found):
          return self.outputParameters.get(found.group(1), '')

      try:
          code = re.sub(r'Field\["([^"]+)"\]', field_value, code)
      except:
          pass  # substitution failed: run the snippet as written

      browser = self.browser
      if iframe and browser.iframe_env == False:
          browser.switch_to.default_content()
          frames = browser.find_elements(
              By.CSS_SELECTOR, "iframe", iframe=False)
          for frame in frames:
              try:
                  browser.switch_to.default_content()
                  browser.switch_to.frame(frame)
                  browser.iframe_env = True
                  break
              except:
                  print("Iframe switch failed")
      elif not iframe and browser.iframe_env == True:
          browser.switch_to.default_content()
          browser.iframe_env = False

      mode = int(codeMode)
      if mode in (0, 2):
          if mode == 0:
              self.recordLog("Execute JavaScript:" + code)
              self.recordLog("执行JavaScript:" + code)
              script_args = ()
          else:
              self.recordLog("Execute JavaScript for element:" + code)
              self.recordLog("对元素执行JavaScript:" + code)
              script_args = (element,)
          browser.set_script_timeout(timeout)
          try:
              result = browser.execute_script(code, *script_args)
          except:
              result = ""
              self.recordLog("JavaScript execution failed")
      elif mode == 1:
          self.recordLog("Execute System Call:" + code)
          self.recordLog("执行系统命令:" + code)
          try:
              completed = subprocess.run(
                  code, capture_output=True, text=True, timeout=timeout, shell=True)
              result = completed.stdout
              print(result)
          except subprocess.TimeoutExpired:
              # the command ran longer than the allowed time
              self.recordLog("Command timed out")
              self.recordLog("命令执行超时")
          except Exception as e:
              print(e)
              self.recordLog("Command execution failed")
              self.recordLog("命令执行失败")
      return str(result)
  def customOperation(self, node, loopValue, loopPath, index):
      """Run a custom-operation node and store its result under its title.

      ``codeMode`` 3 and 4 only raise the ``BREAK`` / ``CONTINUE`` flags;
      mode 2 runs the code against the ``index``-th element matching
      ``loopPath``; every other mode is passed to ``execute_code``. When the
      node is recorded as a field, a new output row is appended as well.
      ``loopValue`` is accepted but not used here.
      """
      settings = node["parameters"]
      if settings["clear"] == 1:
          self.clearOutputParameters()
      mode = int(settings["codeMode"])
      script = settings["code"]
      timeout = int(settings["waitTime"])
      result = ""

      if mode == 3:
          self.BREAK = True
      elif mode == 4:
          self.CONTINUE = True
      elif mode == 2:
          # inside a loop the given loopPath is the actual xpath to use
          try:
              target = self.browser.find_elements(
                  By.XPATH, loopPath, iframe=settings["iframe"])[index]
              result = self.execute_code(
                  mode, script, timeout, target, iframe=settings["iframe"])
          except:
              result = ""
              print("JavaScript execution failed")
      else:  # modes 0 and 1
          result = self.execute_code(
              mode, script, timeout, iframe=settings["iframe"])

      as_field = bool(settings["recordASField"])
      self.outputParameters[node["title"]] = result
      if as_field:
          self.OUTPUT.append(new_line(
              self.outputParameters, self.maxViewLength, self.outputParametersRecord))
  def switchSelect(self, para, loopValue):
      """Change the selected option of the drop-down at ``para["xpath"]``.

      ``para["optionMode"]`` chooses how: 0 moves to the next option
      (wrapping around), 1 selects by index, 2 by value and 3 by visible
      text, using ``para["optionValue"]``. Failures are only printed.
      ``loopValue`` is accepted but not used here.
      """
      mode = int(para["optionMode"])
      wanted = para["optionValue"]
      try:
          dropdown = Select(self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"]))
      except:
          print("找不到下拉框元素:", para["xpath"])
          print("Cannot find drop-down box element:", para["xpath"])
          return

      try:
          if mode == 0:
              # advance from the currently selected option to the next one
              current = dropdown.options.index(dropdown.first_selected_option)
              dropdown.select_by_index((current + 1) % len(dropdown.options))
          elif mode == 1:
              dropdown.select_by_index(int(wanted))
          elif mode == 2:
              dropdown.select_by_value(wanted)
          elif mode == 3:
              dropdown.select_by_visible_text(wanted)
      except:
          print("切换下拉框选项失败:", para["xpath"],
                para["optionMode"], para["optionValue"])
          print("Failed to change drop-down box option:",
                para["xpath"], para["optionMode"], para["optionValue"])
  def moveToElement(self, para, loopElement=None, loopPath="", index=0):
      """Move the mouse pointer onto an element after a 0.1 second pause.

      With ``para["useLoop"]`` set, the ``index``-th match of ``loopPath`` is
      the target; otherwise the first match of ``para["xpath"]``. Failures
      are only printed. ``loopElement`` is accepted but not used here.
      """
      time.sleep(0.1)  # short pause before moving
      if para["useLoop"]:
          path, position = loopPath, index
      else:
          path, position = para["xpath"], 0

      try:
          target = self.browser.find_elements(
              By.XPATH, path, iframe=para["iframe"])[position]
      except:
          print("找不到元素:", para["xpath"])
          print("Cannot find element:", para["xpath"])
          return

      try:
          ActionChains(self.browser).move_to_element(target).perform()
      except:
          print("移动鼠标到元素失败:", para["xpath"])
          print("Failed to move mouse to element:", para["xpath"])
  492. # 执行节点关键函数部分
  def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
      """Execute node ``nodeId`` of the flow, then apply its wait settings.

      Args:
          nodeId: Key of the node in ``self.procedure``.
          loopValue: Current loop item, forwarded to the operation.
          loopPath: XPath of the enclosing loop, forwarded to the operation.
          index: Position of the current item in the enclosing loop.

      The node's ``option`` selects the operation (0/10: run the child
      sequence, 1: open page, 2: click, 3: extract data, 4: input text,
      5: custom operation, 6: switch drop-down, 7: move mouse, 8: loop,
      9: conditional branch). Afterwards every node except root (0) and
      click (2) sleeps for ``parameters["wait"]`` seconds (0.01 when that
      value is negative), exactly or randomized between 0.5x and 1.5x when
      ``waitType`` is 1. Finally ``self.event.wait()`` is called.

      No element-visibility wait is done here; the individual operations
      look up the elements they need themselves.
      """
      node = self.procedure[nodeId]
      option = node["option"]

      if option == 0 or option == 10:  # root / branch body: run children
          for child in node["sequence"]:
              self.executeNode(child, loopValue, loopPath, index)
      elif option == 1:
          self.recordLog("openPage")
          self.openPage(node["parameters"], loopValue)
      elif option == 2:
          self.recordLog("Click")
          self.clickElement(node["parameters"], loopValue, loopPath, index)
      elif option == 3:
          self.recordLog("getData")
          self.getData(node["parameters"], loopValue, node["isInLoop"],
                       parentPath=loopPath, index=index)
          self.saveData()
      elif option == 4:
          self.inputInfo(node["parameters"], loopValue)
      elif option == 5:
          self.customOperation(node, loopValue, loopPath, index)
          self.saveData()
      elif option == 6:
          self.switchSelect(node["parameters"], loopValue)
      elif option == 7:
          self.moveToElement(node["parameters"], loopValue, loopPath, index)
      elif option == 8:
          self.recordLog("loop")
          self.loopExecute(node, loopValue, loopPath, index)
      elif option == 9:
          self.recordLog("judge")
          self.judgeExecute(node, loopValue, loopPath, index)

      # Click nodes are excluded here (they define their own waiting).
      if option != 0 and option != 2:
          waitTime = 0.01  # default when the configured value is negative
          if node["parameters"]["wait"] >= 0:
              waitTime = node["parameters"]["wait"]
          try:
              waitType = int(node["parameters"]["waitType"])
          except (KeyError, TypeError, ValueError):
              waitType = 0
          if waitType == 0:  # fixed wait
              time.sleep(waitTime)
          elif waitType == 1:  # randomized wait
              time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
          self.Log("Wait seconds after node executing: ", waitTime)
      self.event.wait()  # blocks for as long as the event makes it wait
  544. # 对判断条件的处理
  def judgeExecute(self, node, loopElement, clickPath="", index=0):
      """Run the first branch of a conditional node whose condition holds.

      The branches in ``node["sequence"]`` are tested in order. The
      ``class`` parameter of a branch selects the condition: 0 always
      matches, 1 page text contains ``value``, 2 page contains an element
      matching ``value``, 3 loop element text contains ``value``, 4 loop
      element contains a match for ``value`` without its first character,
      5/6/7 the result of JavaScript / a system command / JavaScript on the
      loop element contains ``"rue"`` or is a positive integer. A failed
      lookup counts as "no match". If no branch matches, or the matching
      branch id is 0, nothing is executed.
      """
      chosen = 0  # id of the branch to run; 0 means none
      for branch_id in node["sequence"]:
          rule = self.procedure[branch_id]["parameters"]
          kind = int(rule["class"])
          matched = False
          if kind == 0:  # unconditional branch
              matched = True
          elif kind == 1:  # page contains text
              try:
                  page_text = self.browser.find_element(
                      By.CSS_SELECTOR, "body", iframe=rule["iframe"]).text
                  matched = page_text.find(rule["value"]) >= 0
              except:
                  matched = False
          elif kind == 2:  # page contains element
              try:
                  matched = bool(self.browser.find_element(
                      By.XPATH, rule["value"], iframe=rule["iframe"]))
              except:  # not found or invalid xpath
                  matched = False
          elif kind == 3:  # loop element contains text
              try:
                  matched = loopElement.text.find(rule["value"]) >= 0
              except:
                  matched = False
          elif kind == 4:  # loop element contains element
              try:
                  matched = bool(loopElement.find_element(
                      By.XPATH, rule["value"][1:]))
              except:
                  matched = False
          elif kind <= 7:  # decided by the return value of some code
              if kind == 5:
                  verdict = self.execute_code(
                      0, rule["code"], rule["waitTime"], iframe=rule["iframe"])
              elif kind == 6:
                  verdict = self.execute_code(
                      1, rule["code"], rule["waitTime"], iframe=rule["iframe"])
              elif kind == 7:
                  verdict = self.execute_code(
                      2, rule["code"], rule["waitTime"], loopElement, iframe=rule["iframe"])
              try:
                  # "true"/"True" both contain "rue"; otherwise expect a number
                  score = 1 if verdict.find("rue") != -1 else int(verdict)
              except:
                  score = 0
              matched = score > 0
          if matched:
              chosen = branch_id
              break
      if chosen != 0:
          self.executeNode(chosen, loopElement, clickPath, index)
  606. # 对循环的处理
  607. def loopExecute(self, node, loopValue, clickPath="", index=0):
  608. time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
  609. # self.Log("循环执行前等待0.1秒")
  610. self.Log("Wait 0.1 second before loop")
  611. thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
  612. thisHistoryLength = self.browser.execute_script(
  613. 'return history.length') # 记录本次循环内的history的length
  614. self.history["index"] = thisHistoryLength
  615. self.history["handle"] = thisHandle
  616. if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
  617. # 无跳转标签页操作
  618. count = 0 # 执行次数
  619. bodyText = "-"
  620. while True: # do while循环
  621. try:
  622. finished = False
  623. # newBodyText = self.browser.page_source
  624. # newBodyText = self.browser.find_element(By.XPATH, "//body").text
  625. newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
  626. if newBodyText == bodyText: # 如果页面内容无变化
  627. print("页面已检测不到新内容,停止循环。")
  628. print("No new content detected on the page, stop loop.")
  629. finished = True
  630. break
  631. else:
  632. if node["parameters"]["exitCount"] == 0:
  633. print("检测到页面变化,继续循环。")
  634. print("Page changed detected, continue loop.")
  635. bodyText = newBodyText
  636. element = self.browser.find_element(
  637. By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  638. for i in node["sequence"]: # 挨个执行操作
  639. self.executeNode(
  640. i, element, node["parameters"]["xpath"], 0)
  641. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  642. self.CONTINUE = False
  643. break
  644. if self.BREAK: # 如果有break操作,退出循环
  645. self.BREAK = False
  646. finished = True
  647. break
  648. finished = True
  649. self.Log("Click: ", node["parameters"]["xpath"])
  650. self.recordLog("Click:" + node["parameters"]["xpath"])
  651. except NoSuchElementException:
  652. # except:
  653. print("Single loop element not found: ",
  654. node["parameters"]["xpath"])
  655. print("找不到要循环的单个元素: ", node["parameters"]["xpath"])
  656. self.recordLog(
  657. "Single loop element not found: " + node["parameters"]["xpath"])
  658. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  659. if node["option"] != 2:
  660. self.executeNode(
  661. i, None, node["parameters"]["xpath"], 0)
  662. finished = True
  663. break # 如果找不到元素,退出循环
  664. finally:
  665. if not finished:
  666. print("\n\n-------Retrying-------\n\n")
  667. self.Log("-------Retrying-------: ",
  668. node["parameters"]["xpath"])
  669. self.recordLog("ClickNotFound:" +
  670. node["parameters"]["xpath"])
  671. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  672. if node["option"] != 2:
  673. self.executeNode(
  674. i, None, node["parameters"]["xpath"], 0)
  675. break # 如果找不到元素,退出循环
  676. count = count + 1
  677. self.Log("Page: ", count)
  678. self.recordLog("Page:" + str(count))
  679. # print(node["parameters"]["exitCount"], "-------")
  680. if node["parameters"]["exitCount"] == count: # 如果达到设置的退出循环条件的话
  681. break
  682. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  683. output = self.execute_code(int(
  684. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  685. code = get_output_code(output)
  686. if code <= 0:
  687. break
  688. elif int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
  689. try:
  690. elements = self.browser.find_elements(By.XPATH,
  691. node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  692. if len(elements) == 0:
  693. print("Loop element not found: ",
  694. node["parameters"]["xpath"])
  695. print("找不到循环元素: ", node["parameters"]["xpath"])
  696. self.recordLog("pathNotFound: " +
  697. node["parameters"]["xpath"])
  698. for index in range(len(elements)):
  699. for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
  700. self.executeNode(i, elements[index],
  701. node["parameters"]["xpath"], index)
  702. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  703. self.CONTINUE = False
  704. break
  705. if self.BREAK:
  706. self.BREAK = False
  707. break
  708. try:
  709. changed_handle = self.browser.current_window_handle != thisHandle
  710. except: # 如果网页被意外关闭了的情况下
  711. self.browser.switch_to.window(
  712. self.browser.window_handles[-1])
  713. changed_handle = self.browser.window_handles[-1] != thisHandle
  714. if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
  715. try:
  716. while True: # 一直关闭窗口直到当前标签页
  717. self.browser.close() # 关闭使用完的标签页
  718. self.browser.switch_to.window(
  719. self.browser.window_handles[-1])
  720. if self.browser.current_window_handle == thisHandle:
  721. break
  722. except Exception as e:
  723. print("关闭标签页发生错误:", e)
  724. print("Error occurred while closing tab: ", e)
  725. if self.history["index"] != thisHistoryLength and self.history[
  726. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  727. difference = thisHistoryLength - \
  728. self.history["index"] # 计算历史记录变化差值
  729. self.browser.execute_script(
  730. 'history.go(' + str(difference) + ')') # 回退历史记录
  731. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  732. time.sleep(node["parameters"]["historyWait"])
  733. # else:
  734. # time.sleep(2)
  735. # 切换历史记录等待:
  736. self.Log("Change history back time or:",
  737. node["parameters"]["historyWait"])
  738. try:
  739. self.browser.execute_script('window.stop()')
  740. except:
  741. pass
  742. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  743. output = self.execute_code(int(
  744. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  745. code = get_output_code(output)
  746. if code <= 0:
  747. break
  748. except NoSuchElementException:
  749. print("Loop element not found: ", node["parameters"]["xpath"])
  750. print("找不到循环元素: ", node["parameters"]["xpath"])
  751. self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
  752. except Exception as e:
  753. raise
  754. elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
  755. # 千万不要忘了分割!!
  756. for path in node["parameters"]["pathList"].split("\n"):
  757. try:
  758. element = self.browser.find_element(
  759. By.XPATH, path, iframe=node["parameters"]["iframe"])
  760. for i in node["sequence"]: # 挨个执行操作
  761. self.executeNode(i, element, path, 0)
  762. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  763. self.CONTINUE = False
  764. break
  765. if self.BREAK:
  766. self.BREAK = False
  767. break
  768. try:
  769. changed_handle = self.browser.current_window_handle != thisHandle
  770. except: # 如果网页被意外关闭了的情况下
  771. self.browser.switch_to.window(
  772. self.browser.window_handles[-1])
  773. changed_handle = self.browser.window_handles[-1] != thisHandle
  774. if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
  775. try:
  776. while True: # 一直关闭窗口直到当前标签页
  777. self.browser.close() # 关闭使用完的标签页
  778. self.browser.switch_to.window(
  779. self.browser.window_handles[-1])
  780. if self.browser.current_window_handle == thisHandle:
  781. break
  782. except Exception as e:
  783. print("关闭标签页发生错误:", e)
  784. print("Error occurred while closing tab: ", e)
  785. if self.history["index"] != thisHistoryLength and self.history[
  786. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  787. difference = thisHistoryLength - \
  788. self.history["index"] # 计算历史记录变化差值
  789. self.browser.execute_script(
  790. 'history.go(' + str(difference) + ')') # 回退历史记录
  791. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  792. time.sleep(node["parameters"]["historyWait"])
  793. # else:
  794. # time.sleep(2)
  795. self.Log("Change history back time or:",
  796. node["parameters"]["historyWait"])
  797. try:
  798. self.browser.execute_script('window.stop()')
  799. except:
  800. pass
  801. except NoSuchElementException:
  802. print("Loop element not found: ", path)
  803. print("找不到循环元素: ", path)
  804. self.recordLog("pathNotFound: " + path)
  805. continue # 循环中找不到元素就略过操作
  806. except Exception as e:
  807. raise
  808. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  809. output = self.execute_code(int(
  810. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  811. code = get_output_code(output)
  812. if code <= 0:
  813. break
  814. elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
  815. textList = node["parameters"]["textList"].split("\n")
  816. for text in textList:
  817. self.recordLog("input: " + text)
  818. for i in node["sequence"]: # 挨个执行操作
  819. self.executeNode(i, text, "", 0)
  820. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  821. self.CONTINUE = False
  822. break
  823. if self.BREAK:
  824. self.BREAK = False
  825. break
  826. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  827. output = self.execute_code(int(
  828. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  829. code = get_output_code(output)
  830. if code <= 0:
  831. break
  832. elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
  833. # tempList = node["parameters"]["textList"].split("\r\n")
  834. urlList = list(
  835. filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
  836. # urlList = []
  837. # for url in tempList:
  838. # if url != "":
  839. # urlList.append(url)
  840. for url in urlList:
  841. self.recordLog("input: " + url)
  842. for i in node["sequence"]:
  843. self.executeNode(i, url, "", 0)
  844. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  845. self.CONTINUE = False
  846. break
  847. if self.BREAK:
  848. self.BREAK = False
  849. break
  850. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  851. output = self.execute_code(int(
  852. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  853. code = get_output_code(output)
  854. if code <= 0:
  855. break
  856. elif int(node["parameters"]["loopType"]) <= 6: # 命令返回值
  857. while True: # do while循环
  858. if int(node["parameters"]["loopType"]) == 5: # JS
  859. output = self.execute_code(
  860. 0, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  861. elif int(node["parameters"]["loopType"]) == 6: # System
  862. output = self.execute_code(
  863. 1, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  864. code = get_output_code(output)
  865. if code <= 0:
  866. break
  867. for i in node["sequence"]: # 挨个执行操作
  868. self.executeNode(i, code, node["parameters"]["xpath"], 0)
  869. if self.BREAK or self.CONTINUE: # 如果有break操作,下面的操作不执行
  870. self.CONTINUE = False
  871. break
  872. if self.BREAK:
  873. self.BREAK = False
  874. break
  875. self.history["index"] = thisHistoryLength
  876. self.history["handle"] = self.browser.current_window_handle
  877. self.scrollDown(node["parameters"])
  878. # 打开网页事件
  def openPage(self, para, loopValue):
      """Open a URL in the first browser tab and record that tab's history state.

      Args:
          para: Operation parameters. Keys read here: "useLoop", "url",
              "links", "maxWaitTime" and "cookies"; the dict is also passed
              on to ``self.scrollDown``.
          loopValue: URL supplied by the enclosing loop; used only when
              ``para["useLoop"]`` is truthy.
      """
      time.sleep(1)  # always wait at least one second before opening a page
      if len(self.browser.window_handles) > 1:
          # Close the most recently opened tab so the page opens in the first tab.
          self.browser.switch_to.window(self.browser.window_handles[-1])
          try:
              self.browser.close()
          except Exception:
              pass  # best effort: the tab may already be gone
      self.browser.switch_to.window(self.browser.window_handles[0])
      self.history["handle"] = self.browser.current_window_handle

      if para["useLoop"]:
          url = loopValue
      elif para["url"] != "about:blank":
          url = self.links[self.urlId]
          # A new start URL begins a fresh record: blank every output field.
          self.clearOutputParameters()
      else:
          # First non-empty line of the configured link list.
          url = list(filter(isnull, para["links"].split("\n")))[0]

      # Replace Field["name"] placeholders with the current output values.
      pattern = r'Field\["([^"]+)"\]'
      try:
          url = re.sub(
              pattern, lambda match: self.outputParameters.get(match.group(1), ''), url)
      except Exception:
          pass  # keep the URL unchanged when substitution is not possible

      try:
          maxWaitTime = int(para["maxWaitTime"])
      except (KeyError, TypeError, ValueError):
          maxWaitTime = 10  # default maximum wait of 10 seconds

      try:
          self.browser.set_page_load_timeout(maxWaitTime)
          self.browser.set_script_timeout(maxWaitTime)
          self.browser.get(url)
          if para["cookies"] != "":
              self.browser.delete_all_cookies()  # drop all existing cookies
              for cookie in para["cookies"].split('\n'):
                  name, separator, value = cookie.strip().partition('=')
                  if not separator or not name:
                      # Blank or malformed line: skip it instead of aborting
                      # the remaining cookies and reporting a load failure.
                      continue
                  self.browser.add_cookie({'name': name, 'value': value})
          self.Log('Loading page: ' + url)
          self.recordLog('Loading page: ' + url)
      except TimeoutException:
          self.Log('Time out after set seconds when loading page: ' + url)
          self.recordLog(
              'Time out after set seconds when loading page: ' + url)
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
      except Exception as e:
          print("Failed to load page: " + url)
          print(e)
          self.recordLog('Failed to load page: ' + url)
          self.recordLog(str(e))

      # Remember the history length of the freshly opened page.
      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          try:
              self.browser.execute_script('window.stop()')
              self.history["index"] = self.browser.execute_script(
                  "return history.length")
          except Exception:
              self.history["index"] = 0
      self.scrollDown(para)  # scroll down according to the parameters
  948. # 键盘输入事件
  def inputInfo(self, para, loopValue):
      """Clear a text box and type a value into it.

      Args:
          para: Operation parameters. Keys read here: "xpath", "iframe",
              "beforeJS", "beforeJSWaitTime", "useLoop", "value", "index",
              "afterJS" and "afterJSWaitTime".
          loopValue: Text supplied by the enclosing loop; used only when
              ``para["useLoop"]`` is truthy.

      Any failure is reported and logged; nothing is raised to the caller.
      """
      time.sleep(0.1)  # short pause before typing
      self.Log("Wait 0.1 second before input")
      try:
          textbox = self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"])
          self.execute_code(
              2, para["beforeJS"], para["beforeJSWaitTime"], textbox, iframe=para["iframe"])  # pre-input JS
          # Clear the existing content: HOME, then SHIFT+END to select, then DELETE.
          textbox.send_keys(Keys.HOME)
          textbox.send_keys(Keys.SHIFT, Keys.END)
          textbox.send_keys(Keys.DELETE)

          value = loopValue if para["useLoop"] else para["value"]

          # Replace Field["name"] placeholders with the current output values
          # and strip the <enter> marker from the text that gets typed.
          pattern = r'Field\["([^"]+)"\]'
          try:
              replaced_text = re.sub(
                  pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
              replaced_text = re.sub(
                  '<enter>', '', replaced_text, flags=re.IGNORECASE)
          except Exception:
              replaced_text = value

          # A non-zero index selects one "~"-separated part (1-based).
          index = para["index"]
          if index != 0:
              try:
                  replaced_text = replaced_text.split("~")[index - 1]
              except Exception:
                  print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
                  print("Failed to get value, maybe because the index is out of range, will use the entire text value")

          textbox.send_keys(replaced_text)
          if value.lower().find("<enter>") >= 0:
              textbox.send_keys(Keys.ENTER)
          self.execute_code(
              2, para["afterJS"], para["afterJSWaitTime"], textbox, iframe=para["iframe"])  # post-input JS
      except Exception as e:
          print("Cannot find input box element:" +
                para["xpath"] + ", please try to set the wait time before executing this operation")
          print("找不到输入框元素:" + para["xpath"] + ",请尝试在执行此操作前设置等待时间")
          self.recordLog("Cannot find input box element:" +
                         para["xpath"] + "Please try to set the wait time before executing this operation")
          # Keep the real cause: this handler also sees failures that are
          # unrelated to locating the element.
          print(e)
          self.recordLog(str(e))
  997. # 点击元素事件
  def clickElement(self, para, loopElement=None, clickPath="", index=0):
      """Click an element, wait, and follow any newly opened tab.

      Args:
          para: Operation parameters. Keys read here: "maxWaitTime",
              "useLoop", "xpath", "iframe", "beforeJS", "beforeJSWaitTime",
              "clickWay", "afterJS", "afterJSWaitTime", "wait" and
              "waitType"; the dict is also passed on to ``self.scrollDown``.
          loopElement: Not used by this method.
          clickPath: XPath to click when ``para["useLoop"]`` is truthy.
          index: Position of the target among the elements matching
              ``clickPath``; forced to 0 when not using the loop.
      """
      try:
          maxWaitTime = int(para["maxWaitTime"])
      except (KeyError, TypeError, ValueError):
          maxWaitTime = 10
      self.browser.set_page_load_timeout(maxWaitTime)
      self.browser.set_script_timeout(maxWaitTime)

      # Defaults so the error paths below never touch an unbound name.
      element = None
      path = clickPath  # inside a loop the passed clickPath is the real xpath
      try:
          if not para["useLoop"]:
              index = 0
              path = para["xpath"]  # otherwise use the xpath defined on the operation
          elements = self.browser.find_elements(
              By.XPATH, path, iframe=para["iframe"])
          element = elements[index]
          # Run the user's JavaScript on the element before clicking.
          if para["beforeJS"] != "":
              self.execute_code(2, para["beforeJS"],
                                para["beforeJSWaitTime"], element, iframe=para["iframe"])
      except Exception:
          print("Cannot find element:" +
                path + ", please try to set the wait time before executing this operation")
          print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
          self.recordLog("Cannot find element:" +
                         path + ", please try to set the wait time before executing this operation")

      tempHandleNum = len(self.browser.window_handles)  # tab count before the click
      try:
          click_way = int(para["clickWay"])
      except (KeyError, TypeError, ValueError):
          click_way = 0
      try:
          if click_way == 0:  # Selenium click
              if element is None:
                  # ActionChains.click(None) clicks at the current pointer
                  # position, so skip the click when nothing was found.
                  self.Log("Element to click not found: " + path)
                  self.recordLog("Element to click not found: " + path)
              else:
                  ActionChains(self.browser).click(element).perform()
          elif click_way == 1:  # JavaScript click
              script = 'var result = document.evaluate(`' + path + \
                  '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
              self.browser.execute_script(script, str(index))
      except TimeoutException:
          self.Log('Time out after set seconds when loading clicked page')
          self.recordLog(
              'Time out after set seconds when loading clicked page')
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
      except Exception as e:
          self.Log(e)
          self.recordLog(str(e))

      # Run the user's JavaScript on the element after clicking.
      try:
          if para["afterJS"] != "":
              # Re-resolve the element (the click may have re-rendered it) and
              # pick the same index that was clicked; fall back to the first
              # match when fewer elements remain.
              candidates = self.browser.find_elements(
                  By.XPATH, path, iframe=para["iframe"])
              element = candidates[index] if index < len(candidates) else candidates[0]
              self.execute_code(2, para["afterJS"],
                                para["afterJSWaitTime"], element, iframe=para["iframe"])
      except Exception:
          print("Cannot find element:" + path)
          self.recordLog("Cannot find element:" +
                         path + ", please try to set the wait time before executing this operation")
          print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")

      # Wait after the click: fixed, or random within 0.5x-1.5x of the setting.
      waitTime = float(para["wait"]) + 0.01
      try:
          waitType = int(para["waitType"])
      except (KeyError, TypeError, ValueError):
          waitType = 0
      if waitType == 0:
          time.sleep(waitTime)
      elif waitType == 1:
          time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))

      if tempHandleNum != len(self.browser.window_handles):
          # The click opened a new tab: switch to it.
          self.browser.switch_to.window(self.browser.window_handles[-1])
          self.history["handle"] = self.browser.current_window_handle
      # Record the history length of whichever tab is now current.
      try:
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      except TimeoutException:
          try:
              self.browser.execute_script('window.stop()')
          except Exception:
              pass
          self.history["index"] = self.browser.execute_script(
              "return history.length")
      self.scrollDown(para)  # scroll down according to the parameters
  def get_content(self, p, element):
      """Extract one field's value from ``element`` according to ``p["contentType"]``.

      Content types handled:
          0  element text, or the href/value/src attribute for node types 2/3/4
          1  like 0, but for other nodes only the element's own text nodes,
             with whitespace runs collapsed to a single space
          2  innerHTML            3  outerHTML
          4  background-image URL 5  current page URL
          6  page title           7  element screenshot saved to disk
          8  OCR text of the element screenshot
          9  result of user JavaScript
          10 selected <select> option value   11 selected option text
          12 result of a system command

      Args:
          p: Field parameters; "contentType" is always read, other keys
              ("nodeType", "downloadPic", "JS", "JSWaitTime", "iframe")
              only for the content types that need them.
          element: The located element; needs the WebElement-style members
              used by the selected content type.

      Returns:
          The extracted content; "" when the content type is not handled
          or (type 7) when only a screenshot file is produced.
      """
      content = ""
      content_type = p["contentType"]
      if content_type in (0, 1):
          # Links, inputs and images are read from an attribute, not from text.
          attribute = {2: "href", 3: "value", 4: "src"}.get(p["nodeType"])
          if attribute is not None:
              found = element.get_attribute(attribute)
              content = found if found is not None else ""
              if p["nodeType"] == 4 and p.get("downloadPic", 0) == 1:
                  download_image(content, "Data/Task_" +
                                 str(self.id) + "/" + self.saveName + "/")
          elif content_type == 0:  # ordinary node: full text
              content = element.text
          else:  # only the element's own text nodes, children excluded
              command = ('var arr = [];'
                         'var content = arguments[0];'
                         'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
                         'if(content.childNodes[i].nodeType === 3){ '
                         'arr.push(content.childNodes[i].nodeValue);'
                         '}'
                         '}'
                         'var str = arr.join(" "); '
                         'return str;')
              own_text = self.browser.execute_script(command, element)
              # Drop newlines, then collapse whitespace runs. A plain
              # str.replace("\\s+", " ") only matches the literal text "\s+".
              content = re.sub(r"\s+", " ", own_text.replace("\n", ""))
      elif content_type == 2:
          content = element.get_attribute('innerHTML')
      elif content_type == 3:
          content = element.get_attribute('outerHTML')
      elif content_type == 4:
          # Background image address with the url("...") wrapper removed.
          bg_url = element.value_of_css_property('background-image')
          content = bg_url.replace('url("', '').replace('")', '')
      elif content_type == 5:
          content = self.browser.current_url
      elif content_type == 6:
          content = self.browser.title
      elif content_type == 7:
          # Resize the window to the document size, then save a screenshot.
          height = self.browser.execute_script(
              "return document.body.scrollHeight")
          width = self.browser.execute_script(
              "return document.body.scrollWidth")
          self.browser.set_window_size(width, height)
          element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
                             "/" + str(time.time()) + ".png")
      elif content_type == 8:
          def recognize(lang):
              # Grayscale the element screenshot and run Tesseract on it.
              screenshot_stream = io.BytesIO(element.screenshot_as_png)
              image = Image.open(screenshot_stream).convert('L')
              return pytesseract.image_to_string(image, lang=lang)

          try:
              content = recognize('chi_sim+eng')
          except Exception:
              try:
                  print("识别中文失败,尝试只识别英文")
                  print("Failed to recognize Chinese, try to recognize English only")
                  content = recognize('eng')
              except Exception as e:
                  content = "OCR Error"
                  print(e)
                  if sys.platform == "win32":
                      print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501")
                      print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
                  elif sys.platform == "darwin":
                      print(
                          "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
                  elif sys.platform == "linux":
                      print(
                          "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
                  else:
                      print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501")
                      print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
                  print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
      elif content_type == 9:
          content = self.execute_code(
              2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
      elif content_type == 12:  # system command return value
          content = self.execute_code(1, p["JS"], p["JSWaitTime"])
      elif content_type == 10:  # value of the selected drop-down option
          try:
              content = Select(element).first_selected_option.get_attribute(
                  "value")
          except Exception:
              content = ""
      elif content_type == 11:  # text of the selected drop-down option
          try:
              content = Select(element).first_selected_option.text
          except Exception:
              content = ""
      return content
  def clearOutputParameters(self):
      """Blank every collected output field in place, keeping the same keys."""
      fields = self.outputParameters
      fields.update(dict.fromkeys(fields, ""))
  1245. # 提取数据事件
  def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
      """Extract every configured field into ``self.outputParameters``.

      Fields flagged "optimizable" are read from an lxml parse of the page
      source; all other fields are located and read through the browser.

      Args:
          para: Operation parameters. Keys read here: "clear", "paras" (the
              list of field definitions) and "recordASField".
          loopElement: Current element of the enclosing loop, or "" when the
              extraction is not tied to a loop element.
          isInLoop: Not used by this method.
          parentPath: XPath of the loop elements; used to re-find the loop
              element and to build absolute paths for relative fields.
          index: Zero-based position of ``loopElement`` among the elements
              matching ``parentPath``.
      """
      not_found_cn = "提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错"
      not_found_retry_cn = "提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错"

      def report_missing(p, chinese_template):
          # Announce a missing field only the first time it happens.
          if not self.dataNotFoundKeys[p["name"]]:
              print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
                  p["relativeXPath"], p["name"]))
              print(chinese_template % (p["name"], p["relativeXPath"]))
              self.dataNotFoundKeys[p["name"]] = True
              self.recordLog(
                  'Element %s not found, use default' % p["relativeXPath"])

      def locate(p):
          # Resolve the element a non-optimizable field refers to.
          if not p["relative"]:
              return self.browser.find_element(
                  By.XPATH, p["relativeXPath"], iframe=p["iframe"])
          if p["relativeXPath"] == "":
              # The relative path is the loop element itself.
              return loopElement
          if p["relativeXPath"].find("//") >= 0:
              # Descendant search: anchor it on the index-th loop element.
              full_path = "(" + parentPath + ")" + \
                  "[" + str(index + 1) + "]" + p["relativeXPath"]
              return self.browser.find_element(
                  By.XPATH, full_path, iframe=p["iframe"])
          return loopElement.find_element(By.XPATH, p["relativeXPath"][1:])

      if para["clear"] == 1:
          self.clearOutputParameters()
      try:
          pageHTML = etree.HTML(self.browser.page_source)
      except Exception:
          pageHTML = etree.HTML("")
      if loopElement != "":  # the loop element is only needed inside a loop
          try:
              loopElementOuterHTML = loopElement.get_attribute('outerHTML')
          except Exception:
              try:
                  # The loop element reference can be lost after a click that
                  # did not open a new tab; find it again by position.
                  elements = self.browser.find_elements(
                      By.XPATH, parentPath, iframe=para["paras"][0]["iframe"])
                  loopElement = elements[index]
                  loopElementOuterHTML = loopElement.get_attribute(
                      'outerHTML')
              except Exception:
                  loopElementOuterHTML = ""
      else:
          loopElementOuterHTML = ""
      loopElementHTML = etree.HTML(loopElementOuterHTML)

      # Fast path: read optimizable fields from the parsed page source.
      for p in para["paras"]:
          if not p["optimizable"]:
              continue
          try:
              # Only valid while the current iframe environment matches.
              if self.browser.iframe_env != p["iframe"]:
                  p["optimizable"] = False  # handled by the browser pass below
                  continue
              relative_xpath = p["relativeXPath"]
              # Append a text/href step unless the path already has one.
              if "/@href" in relative_xpath or "/text()" in relative_xpath or "::text()" in relative_xpath:
                  content_type = ""
              elif p["nodeType"] == 2:
                  content_type = "/@href"
              elif p["contentType"] == 1:
                  content_type = "/text()"
              elif p["contentType"] == 0:
                  content_type = "//text()"
              else:
                  content_type = ""
              xpath = relative_xpath + content_type
              if p["relative"]:
                  if relative_xpath.find("//") >= 0:
                      if xpath.startswith("/"):
                          full_path = "(" + parentPath + ")" + \
                              "[" + str(index + 1) + "]" + \
                              relative_xpath + content_type
                      else:  # forms such as id(...) need no parentPath wrapper
                          full_path = xpath
                      try:
                          content = pageHTML.xpath(full_path)
                      except Exception:
                          content = []
                  elif not relative_xpath.startswith("/"):  # forms such as id(...)
                      try:
                          content = loopElementHTML.xpath(xpath)
                      except Exception:
                          content = []
                  else:
                      content = loopElementHTML.xpath(
                          "/html/body/" + loopElementHTML[0][0].tag + xpath)
              else:
                  # Forms such as id(...) or (//div)[1] need no /html/body prefix.
                  if xpath.find("/body") < 0 and xpath.startswith("/"):
                      xpath = "/html/body" + xpath
                  content = pageHTML.xpath(xpath)
              if len(content) > 0:
                  # Join all non-blank text pieces, stripped of outer whitespace.
                  content = ' '.join(result.strip()
                                     for result in content if result.strip())
                  if p["nodeType"] == 2:
                      # Turn a relative link into an absolute one.
                      content = urljoin(self.browser.current_url, content)
              else:
                  content = p["default"]
                  report_missing(p, not_found_cn)
          except Exception:
              # Fall back to the default here; otherwise the assignment below
              # would reuse the previous field's value or hit an unbound name.
              content = p.get("default", "")
              report_missing(p, not_found_retry_cn)
          self.outputParameters[p["name"]] = content

      # Browser pass: fields that cannot be read from the parsed source.
      for p in para["paras"]:
          if p["optimizable"]:
              continue
          content = ""
          if not (p["contentType"] == 5 or p["contentType"] == 6):
              # Anything but page URL / title needs the element itself.
              try:
                  element = locate(p)
              except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException):
                  # Element not found: store the default value.
                  content = p.get("default", "")
                  self.outputParameters[p["name"]] = content
                  try:
                      report_missing(p, not_found_cn)
                  except Exception:
                      pass
                  continue
              except TimeoutException:
                  self.Log('Time out after set seconds when getting data')
                  self.recordLog(
                      'Time out after set seconds when getting data')
                  try:
                      self.browser.execute_script('window.stop()')
                  except Exception:
                      pass
                  element = locate(p)  # retry once with loading stopped
          else:
              element = self.browser.find_element(
                  By.XPATH, "//body", iframe=p["iframe"])
          try:
              self.execute_code(
                  2, p["beforeJS"], p["beforeJSWaitTime"], element, iframe=p["iframe"])  # pre-extraction JS
              content = self.get_content(p, element)
          except StaleElementReferenceException:
              # The element went stale: wait, find it again and retry once.
              self.recordLog(
                  'StaleElementReferenceException: ' + p["relativeXPath"])
              time.sleep(3)
              try:
                  element = locate(p)
                  if not p["relative"]:
                      self.recordLog(
                          'StaleElementReferenceException: relativeXPath')
                  elif p["relativeXPath"] == "":
                      self.recordLog(
                          'StaleElementReferenceException: loopElement')
                  else:
                      self.recordLog(
                          'StaleElementReferenceException: loopElement+relativeXPath')
                  content = self.get_content(p, element)
              except StaleElementReferenceException:
                  self.recordLog(
                      'StaleElementReferenceException: ' + p["relativeXPath"])
                  continue  # give up on this field if it goes stale again
          self.outputParameters[p["name"]] = content
          self.execute_code(
              2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"])  # post-extraction JS

      if para["recordASField"] > 0:
          line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
          self.OUTPUT.append(line)
  1448. if __name__ == '__main__':
  1449. # from multiprocessing import freeze_support
  1450. # freeze_support() # 防止无限死循环多开
  1451. config = {
  1452. "id": [0],
  1453. "saved_file_name": "",
  1454. "user_data": False,
  1455. "config_folder": "",
  1456. "config_file_name": "config.json",
  1457. "read_type": "remote",
  1458. "headless": False,
  1459. "server_address": "http://localhost:8074",
  1460. "version": "0.3.6",
  1461. }
  1462. c = Config(config)
  1463. print(c)
  1464. options = Options()
  1465. driver_path = "chromedriver.exe"
  1466. import platform
  1467. print(sys.platform, platform.architecture())
  1468. option = webdriver.ChromeOptions()
  1469. if not os.path.exists(os.getcwd()+"/Data"):
  1470. os.mkdir(os.getcwd()+"/Data")
  1471. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  1472. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1473. # MacOS需要用option而不是options!
  1474. option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1475. option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1476. options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1477. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  1478. # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1479. # # MacOS需要用option而不是options!
  1480. # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1481. # driver_path = os.getcwd()+ "/chromedriver_mac64"
  1482. print(driver_path)
  1483. if c.config_folder == "":
  1484. c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
  1485. # print("Config folder for MacOS:", c.config_folder)
  1486. elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
  1487. print("Finding chromedriver in EasySpider",
  1488. os.getcwd()+"/EasySpider")
  1489. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  1490. options.binary_location = os.path.join(
  1491. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1492. option.binary_location = os.path.join(
  1493. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1494. driver_path = os.path.join(
  1495. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  1496. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1497. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1498. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  1499. options.binary_location = os.path.join(
  1500. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1501. option.binary_location = os.path.join(
  1502. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1503. driver_path = os.path.join(
  1504. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  1505. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1506. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1507. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  1508. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1509. option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1510. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  1511. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1512. options.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1513. else:
  1514. print("Unsupported platform")
  1515. sys.exit()
  1516. print("Chrome location:", options.binary_location)
  1517. print("Chromedriver location:", driver_path)
  1518. # elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
  1519. # print("Finding chromedriver in ./Chrome",
  1520. # os.getcwd()+"/Chrome")
  1521. # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
  1522. # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
  1523. # driver_path = "./Chrome/chromedriver.exe"
  1524. elif os.path.exists(os.getcwd()+"/../ElectronJS"):
  1525. # 软件dev用
  1526. print("Finding chromedriver in EasySpider",
  1527. os.getcwd()+"/ElectronJS")
  1528. option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1529. options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1530. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  1531. option.add_extension("../ElectronJS/XPathHelper.crx")
  1532. else:
  1533. options.binary_location = "./chrome.exe" # 指定chrome位置
  1534. driver_path = "./chromedriver.exe"
  1535. option.add_extension("XPathHelper.crx")
  1536. option.add_experimental_option(
  1537. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  1538. # user_data_dir = r'' # 注意没有Default!
  1539. # options.add_argument('--user-data-dir='+p)
  1540. # 总结:
  1541. # 0. 带Cookie需要用userdatadir
  1542. # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
  1543. # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
  1544. # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
  1545. # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
  1546. try:
  1547. with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
  1548. config = json.load(f)
  1549. print("Config file path: " + c.config_folder + c.config_file_name)
  1550. absolute_user_data_folder = config["absolute_user_data_folder"]
  1551. print("\nAbsolute_user_data_folder:",
  1552. absolute_user_data_folder, "\n")
  1553. except:
  1554. pass
  1555. if c.user_data:
  1556. option.add_argument(
  1557. f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
  1558. option.add_argument("--profile-directory=Default")
  1559. options.add_argument(
  1560. f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
  1561. options.add_argument("--profile-directory=Default")
  1562. if c.headless:
  1563. print("Headless mode")
  1564. print("无头模式")
  1565. option.add_argument("--headless")
  1566. options.add_argument("--headless")
  1567. # options.add_argument(
  1568. # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
  1569. option.add_argument(
  1570. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1571. options.add_argument(
  1572. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1573. threads = []
  1574. for i in c.id:
  1575. # print(options)
  1576. print("id: ", i)
  1577. if c.read_type == "remote":
  1578. print("remote")
  1579. content = requests.get(
  1580. c.server_address + "/queryExecutionInstance?id=" + str(i))
  1581. service = json.loads(content.text) # 加载服务信息
  1582. else:
  1583. print("local")
  1584. with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f:
  1585. content = f.read()
  1586. service = json.loads(content) # 加载服务信息
  1587. print("Task Name:", service["name"])
  1588. print("任务名称:", service["name"])
  1589. try:
  1590. cloudflare = service["cloudflare"]
  1591. except:
  1592. cloudflare = 0
  1593. if cloudflare == 0:
  1594. options.add_argument('log-level=3') # 隐藏日志
  1595. option.add_argument('log-level=3') # 隐藏日志
  1596. options.add_experimental_option("prefs", {
  1597. # 设置文件下载路径
  1598. "download.default_directory": "Data/Task_" + str(i),
  1599. "download.prompt_for_download": False, # 禁止下载提示框
  1600. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1601. "download.directory_upgrade": True,
  1602. "download.extensions_to_open": "applications/pdf",
  1603. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1604. })
  1605. option.add_experimental_option("prefs", {
  1606. # 设置文件下载路径
  1607. "download.default_directory": "Data/Task_" + str(i),
  1608. "download.prompt_for_download": False, # 禁止下载提示框
  1609. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1610. "download.directory_upgrade": True,
  1611. "download.extensions_to_open": "applications/pdf",
  1612. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1613. })
  1614. try:
  1615. if service["environment"] == 1:
  1616. option.add_experimental_option(
  1617. 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1618. options.add_experimental_option(
  1619. 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1620. except:
  1621. pass
  1622. browser_t = MyChrome(
  1623. options=options, chrome_options=option, executable_path=driver_path)
  1624. elif cloudflare == 1:
  1625. if sys.platform == "win32":
  1626. options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
  1627. # options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
  1628. browser_t = MyUCChrome(
  1629. options=options, driver_executable_path=driver_path)
  1630. else:
  1631. print("Cloudflare模式只支持Windows x64平台。")
  1632. print("Cloudflare Mode only support on Windows x64 platform.")
  1633. sys.exit()
  1634. event = Event()
  1635. event.set()
  1636. thread = BrowserThread(browser_t, i, service,
  1637. c.version, event, c.saved_file_name, config=config)
  1638. print("Thread with task id: ", i, " is created")
  1639. threads.append(thread)
  1640. thread.start()
  1641. # Set the pause operation
  1642. # if sys.platform != "linux":
  1643. # time.sleep(3)
  1644. # print("\n\n----------------------------------")
  1645. # print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
  1646. # print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
  1647. # print("----------------------------------\n\n")
  1648. # Thread(target=check_pause, args=("p", event)).start()
  1649. # else:
  1650. time.sleep(3)
  1651. press_time = {"duration": 0, "is_pressed": False}
  1652. print("\n\n----------------------------------")
  1653. print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
  1654. print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
  1655. print("----------------------------------\n\n")
  1656. # if cloudflare:
  1657. # print("过Cloudflare验证模式有时候会不稳定,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
  1658. # print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
  1659. # 使用监听器监听键盘输入
  1660. try:
  1661. with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:
  1662. listener.join()
  1663. except:
  1664. pass
  1665. # print("您的操作系统不支持暂停功能。")
  1666. # print("Your operating system does not support the pause function.")
  1667. # print("线程长度:", len(threads) )
  1668. for thread in threads:
  1669. print()
  1670. thread.join()
  1671. for thread in threads:
  1672. thread.browser.quit()
  1673. # print("Thread with task id: ", thread.id, " is closed")
  1674. print("程序已运行完成,请手动关闭此窗口。")
  1675. print("The program has finished running, please manually close this window.")