easyspider_executestage.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501
  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. from datetime import datetime
  4. import io # 遇到错误退出时应执行的代码
  5. import json
  6. # from lib2to3.pgen2 import driver
  7. import re
  8. # import shutil
  9. import subprocess
  10. import sys
  11. # from urllib import parse
  12. # import base64
  13. # import hashlib
  14. import time
  15. # import keyboard
  16. import requests
  17. from lxml import etree
  18. from selenium.webdriver.chrome.options import Options
  19. from selenium.webdriver.common.keys import Keys
  20. from selenium.webdriver.common.action_chains import ActionChains
  21. from selenium import webdriver
  22. from selenium.webdriver.support.ui import WebDriverWait
  23. from selenium.webdriver.support import expected_conditions as EC
  24. from selenium.webdriver.common.by import By
  25. from selenium.common.exceptions import NoSuchElementException
  26. from selenium.common.exceptions import TimeoutException
  27. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  28. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  29. from selenium.webdriver.support.ui import Select
  30. from selenium.webdriver import ActionChains
  31. from selenium.webdriver.common.by import By
  32. import undetected_chromedriver as uc
  33. import random
  34. # import pandas as pd
  35. from openpyxl import load_workbook, Workbook
  36. # import numpy
  37. import csv
  38. import os
  39. from commandline_config import Config
  40. import pytesseract
  41. from PIL import Image
  42. # import uuid
  43. from threading import Thread, Event
  44. from myChrome import MyChrome, MyUCChrome
  45. from utils import check_pause, download_image, get_output_code, isnull, myMySQL, new_line, write_to_csv, write_to_excel
  # Module-wide capability tweak. No copy is taken, so this edits the shared
  # DesiredCapabilities.CHROME mapping itself.
  # NOTE(review): "none" presumably makes page loads non-blocking - confirm
  # against the Selenium page-load-strategy documentation.
  desired_capabilities = DesiredCapabilities.CHROME
  desired_capabilities.update(pageLoadStrategy="none")
  48. class BrowserThread(Thread):
  def __init__(self, browser_t, id, service, version, event, saveName, config):
      """Set up a worker thread that runs one task description in a browser.

      Args:
          browser_t: Started browser driver. It must provide
              ``execute_cdp_cmd``, ``get`` and ``quit``.
          id: Task id; used to build the ``Data/Task_<id>`` output paths.
          service: Task description dict. ``graph``, ``links``,
              ``containJudge``, ``outputParameters`` and ``version`` are read;
              ``saveName``, ``maxViewLength``, ``outputFormat`` and
              ``saveThreshold`` fall back to defaults when absent.
          version: Version string of the running executor, compared with
              ``service["version"]`` for tasks older than 0.3.1.
          event: Stored on the instance as ``self.event``.
          saveName: When non-empty, overrides the save name from ``service``.
          config: Stored as ``self.config``; ``config["mysql_config_path"]``
              is read when the output format is ``"mysql"``.

      Raises:
          SystemExit: When the task has no version field, or its version is
              below 0.3.1 and differs from ``version``.
      """
      Thread.__init__(self)
      self.browser = browser_t
      self.config = config
      self.id = id
      self.event = event
      self.log = ""  # log text buffered in memory
      self.SAVED = False
      self.BREAK = False

      # Save name: task value, else a second-precision timestamp; a non-empty
      # command-line value overrides both. "current_time" is a placeholder
      # that is replaced by the timestamp.
      timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
      self.saveName = service.get("saveName", timestamp)
      if saveName != "":
          self.saveName = saveName
      self.saveName = self.saveName.replace("current_time", timestamp)

      # Use the id passed in rather than a module-level loop variable, so the
      # thread does not depend on how the caller names its loop index.
      task_dir = "Data/Task_" + str(self.id)
      print("Save Name for task ID", self.id, "is:", self.saveName)
      print("任务ID", self.id, "的保存文件名为:", self.saveName)
      # Folder for screenshots; makedirs also creates missing parents.
      os.makedirs(task_dir + "/" + self.saveName, exist_ok=True)

      # NOTE(review): driver_path is not a parameter; it is expected to exist
      # as a module-level name set by the launcher code - confirm.
      stealth_path = driver_path[:driver_path.find(
          "chromedriver")] + "stealth.min.js"
      with open(stealth_path, 'r') as f:
          js = f.read()
          print("Loading stealth.min.js")
      # Inject the stealth script into every new document (anti-bot measure).
      self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
          'source': js})
      self.browser.get('about:blank')

      self.procedure = service["graph"]  # execution flow of the task
      self.maxViewLength = service.get("maxViewLength", 15)
      self.outputFormat = service.get("outputFormat", "csv")

      def _version_key(text):
          # Numeric comparison, so that "0.10.0" ranks above "0.3.1".
          return tuple(int(part) for part in re.findall(r"\d+", str(text)))

      try:
          task_version = service["version"]
      except KeyError:
          # Tasks saved by v0.2.0 have no "version" field at all.
          print("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider v0.2.0 to run this task!")
          self.browser.quit()
          sys.exit()
      # Executors from 0.3.1 on accept every task from 0.3.1 on; older tasks
      # need the exact same version. The exit is outside any try block so
      # SystemExit cannot be caught by the missing-field handler.
      if _version_key(task_version) < _version_key("0.3.1") and task_version != version:
          print("版本不一致,请使用" +
                str(task_version) + "版本的EasySpider运行该任务!")
          print("Version not match, please use EasySpider " +
                str(task_version) + " to run this task!")
          self.browser.quit()
          sys.exit()

      self.save_threshold = service.get("saveThreshold", 10)
      # Links to execute, filtered through isnull.
      self.links = list(filter(isnull, service["links"].split("\n")))

      self.OUTPUT = []  # collected rows
      self.writeMode = 1  # 0 = new file, 1 = append, 2 = MySQL
      if self.outputFormat in ("csv", "txt", "xlsx"):
          data_file = task_dir + "/" + self.saveName + "." + self.outputFormat
          if not os.path.exists(data_file):
              self.OUTPUT.append([])  # header row, filled in below
              self.writeMode = 0
      elif self.outputFormat == "mysql":
          self.mysql = myMySQL(config["mysql_config_path"])
          self.mysql.create_table(self.saveName, service["outputParameters"])
          self.writeMode = 2
      if self.writeMode == 1:
          print("追加模式")
          print("Append Mode")
      elif self.writeMode == 0:
          print("新建模式")
          print("New Mode")
      elif self.writeMode == 2:
          print("MySQL模式")
          print("MySQL Mode")

      self.containJudge = service["containJudge"]  # task contains branches
      self.outputParameters = {}
      self.outputParametersTypes = []
      self.outputParametersRecord = []  # whether each field is recorded
      self.dataNotFoundKeys = {}  # per-field "data not found" flags
      self.history = {"index": 0, "handle": None}  # history position record
      for para in service["outputParameters"]:
          name = para["name"]
          if name in self.outputParameters:
              continue  # a repeated field name is registered only once
          self.outputParameters[name] = ""
          self.dataNotFoundKeys[name] = False
          self.outputParametersTypes.append(para.get("type", "text"))
          self.outputParametersRecord.append(
              bool(para.get("recordASField", True)))
          # writeMode 0 is only set for csv/txt/xlsx with a fresh file, which
          # is exactly when the header row exists; appended files get none.
          if self.writeMode == 0:
              self.OUTPUT[0].append(name)
      self.urlId = 0  # index of the link currently being executed
      self.preprocess()  # fill defaults and mark optimizable extractions
  165. # 检测如果没有复杂的操作,优化提取数据流程
  def preprocess(self):
      """Fill in missing node defaults and flag simple extraction fields.

      Every node gets ``parameters["iframe"] = False`` when the key is
      absent. Open-page nodes (option 1) get ``cookies = ""`` when absent.
      Each field of an extract-data node (option 3) gets an ``iframe``
      default and an ``optimizable`` flag, which is True only when it has no
      before/after JS and its contentType/nodeType are at most 1 and 2.
      """
      for step in self.procedure:
          if "iframe" not in step["parameters"]:
              step["parameters"]["iframe"] = False
          if step["option"] == 1 and "cookies" not in step["parameters"]:
              step["parameters"]["cookies"] = ""
          if step["option"] == 3:
              for field in step["parameters"]["paras"]:
                  if "iframe" not in field:
                      field["iframe"] = False
                  field["optimizable"] = (
                      field["beforeJS"] == ""
                      and field["afterJS"] == ""
                      and field["contentType"] <= 1
                      and field["nodeType"] <= 2
                  )
  def run(self):
      """Thread body: run the task graph once per link, then flush and clean up.

      After the last link the task's output folder is removed if it is
      empty, the log and remaining rows are written with
      ``saveData(exit=True)``, and the MySQL connection is closed when that
      output format is in use.
      """
      for position in range(1, len(self.links) + 1):
          print("正在执行第", position, "/ ", len(self.links), "个链接")
          print("Executing link", position, "/ ", len(self.links))
          self.executeNode(0)  # node 0 is the entry node of the graph
          self.urlId += 1

      output_dir = "Data/Task_" + str(self.id) + "/" + self.saveName
      # Drop the folder when nothing was stored in it.
      if len(os.listdir(output_dir)) == 0:
          os.rmdir(output_dir)

      print("Done!")
      print("执行完成!")
      self.recordLog("Done!")
      self.saveData(exit=True)
      if self.outputFormat == "mysql":
          self.mysql.close()
  def recordLog(self, str=""):
      """Append ``str`` plus a newline to the in-memory log buffer."""
      self.log = "".join((self.log, str, "\n"))
  207. # 控制台打印log函数
  def Log(self, text, text2=""):
      """Console trace helper; prints nothing while the flag below is False."""
      enabled = False  # hard-coded off; set to True locally to trace
      if not enabled:
          return
      print(text, text2)
  212. # @atexit.register
  213. # def clean(self):
  214. # self.saveData(exit=True)
  215. # self.browser.quit()
  216. # sys.exit(0)
  def saveData(self, exit=False):
      """Flush the buffered log and rows to storage, then clear both buffers.

      Nothing happens unless ``exit`` equals True or at least
      ``self.save_threshold`` rows are buffered. The log is appended to
      ``Data/Task_<id>/<saveName>_log.txt``; rows go to the writer that
      matches ``self.outputFormat`` (csv/txt, xlsx or mysql).
      """
      if not (exit == True or len(self.OUTPUT) >= self.save_threshold):
          return

      base_name = "Data/Task_" + str(self.id) + "/" + self.saveName

      # Log first; the with-block closes the file.
      with open(base_name + '_log.txt', 'a', encoding='utf-8-sig') as log_file:
          log_file.write(self.log)

      # Then the data rows.
      fmt = self.outputFormat
      if fmt == "csv" or fmt == "txt":
          write_to_csv(base_name + '.' + fmt, self.OUTPUT,
                       self.outputParametersRecord)
      elif fmt == "xlsx":
          write_to_excel(base_name + '.xlsx', self.OUTPUT,
                         self.outputParametersTypes, self.outputParametersRecord)
      elif fmt == "mysql":
          self.mysql.write_to_mysql(
              self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)

      self.OUTPUT = []
      self.log = ""
  def scrollDown(self, para, rt=""):
      """Scroll the page by sending keys to ``<body>``, retrying once on error.

      Args:
          para: Parameter dict. ``scrollType`` (0 = no scrolling,
              1 = Page Down, 2 = End), ``scrollCount`` and ``iframe`` are
              read; ``scrollWaitTime`` (seconds) is optional.
          rt: Optional object with an ``end()`` method, called after a failed
              first attempt; the default ``""`` means "none".

      A failure in the first attempt is logged, page loading is stopped with
      ``window.stop()``, and the scroll is attempted once more. An error in
      that second attempt propagates to the caller.
      """
      def pause():
          # Best effort: a missing or unusable wait time means "do not wait".
          try:
              time.sleep(para["scrollWaitTime"])
          except (KeyError, TypeError, ValueError, OverflowError):
              pass

      def scroll():
          if not (scrollType != 0 and para["scrollCount"] > 0):
              return
          for _ in range(para["scrollCount"]):
              self.Log("Wait for set second after screen scrolling")
              body = self.browser.find_element(
                  By.CSS_SELECTOR, "body", iframe=para["iframe"])
              if scrollType == 1:
                  # Keys.PAGE_DOWN is the Selenium constant for Page Down;
                  # the Keys class has no PGDN attribute.
                  body.send_keys(Keys.PAGE_DOWN)
              elif scrollType == 2:
                  body.send_keys(Keys.END)
              pause()  # wait after each scroll step

      pause()  # wait before the first scroll
      scrollType = int(para["scrollType"])
      try:
          scroll()
      except Exception:
          self.Log('Time out after set seconds when scrolling. ')
          self.recordLog('Time out after set seconds when scrolling')
          self.browser.execute_script('window.stop()')
          scroll()
          # NOTE(review): the source lost its indentation; rt.end() is placed
          # inside the failure path - confirm against the upstream file.
          if rt != "":
              rt.end()
  def execute_code(self, codeMode, code, max_wait_time, element=None, iframe=False):
      """Run a user-supplied snippet and return its result as a string.

      Args:
          codeMode: 0 = JavaScript in the page, 1 = system shell command,
              2 = JavaScript with ``element`` passed as its argument. Any
              other value runs nothing.
          code: Snippet text. Each ``Field["name"]`` is replaced by the value
              of ``self.outputParameters["name"]`` (empty when unknown).
          max_wait_time: Timeout in seconds; 0 is replaced by 999999.
          element: Argument handed to the script in mode 2.
          iframe: Whether the snippet should run in an iframe context.

      Returns:
          ``str`` of the script return value or the command's stdout; ``""``
          when ``code`` is empty or execution failed or timed out.
      """
      if code == "":
          return ""
      result = ""
      timeout = 999999 if max_wait_time == 0 else max_wait_time

      # Substitute Field["..."] placeholders; keep the text as is on failure.
      try:
          code = re.sub(
              r'Field\["([^"]+)"\]',
              lambda found: self.outputParameters.get(found.group(1), ''),
              code)
      except:
          pass

      # Bring the browser into the requested frame context.
      if iframe:
          if self.browser.iframe_env == False:
              self.browser.switch_to.default_content()
              frames = self.browser.find_elements(
                  By.CSS_SELECTOR, "iframe", iframe=False)
              # Enter the first iframe that can be switched to.
              for frame in frames:
                  try:
                      self.browser.switch_to.default_content()
                      self.browser.switch_to.frame(frame)
                      self.browser.iframe_env = True
                      break
                  except:
                      print("Iframe switch failed")
      elif self.browser.iframe_env == True:
          self.browser.switch_to.default_content()
          self.browser.iframe_env = False

      mode = int(codeMode)
      if mode == 0 or mode == 2:
          if mode == 0:
              self.recordLog("Execute JavaScript:" + code)
              self.recordLog("执行JavaScript:" + code)
          else:
              self.recordLog("Execute JavaScript for element:" + code)
              self.recordLog("对元素执行JavaScript:" + code)
          self.browser.set_script_timeout(timeout)
          try:
              if mode == 0:
                  result = self.browser.execute_script(code)
              else:
                  result = self.browser.execute_script(code, element)
          except:
              result = ""
              self.recordLog("JavaScript execution failed")
      elif mode == 1:
          self.recordLog("Execute System Call:" + code)
          self.recordLog("执行系统命令:" + code)
          try:
              # The command text comes from the task definition and is run
              # through the shell as written.
              finished = subprocess.run(
                  code, capture_output=True, text=True, timeout=timeout, shell=True)
              result = finished.stdout
              print(result)
          except subprocess.TimeoutExpired:
              self.recordLog("Command timed out")
              self.recordLog("命令执行超时")
          except Exception as e:
              print(e)
              self.recordLog("Command execution failed")
              self.recordLog("命令执行失败")
      return str(result)
  def customOperation(self, node, loopValue, loopPath, index):
      """Execute a custom-operation node and store its return value.

      ``parameters["codeMode"]`` selects the action: 2 runs the code against
      the ``index``-th element matching ``loopPath``, 3 only raises the
      ``self.BREAK`` flag, and any other value is passed to ``execute_code``
      unchanged. The result is stored in ``self.outputParameters`` under the
      node title; when ``recordASField`` is truthy it is also printed and a
      new output row is appended.
      """
      settings = node["parameters"]
      mode = int(settings["codeMode"])
      script = settings["code"]
      result = ""
      timeout = int(settings["waitTime"])

      if mode == 3:
          self.BREAK = True  # request to leave the enclosing loop
      elif mode == 2:
          # Inside a loop the path handed in is the real XPath of the items.
          try:
              target = self.browser.find_elements(
                  By.XPATH, loopPath, iframe=settings["iframe"])[index]
              result = self.execute_code(
                  mode, script, timeout, target, iframe=settings["iframe"])
          except:
              result = ""
              print("JavaScript execution failed")
      else:  # modes 0 and 1
          result = self.execute_code(
              mode, script, timeout, iframe=settings["iframe"])

      keep_as_field = bool(settings["recordASField"])
      if keep_as_field:
          print("操作<" + node["title"] + ">的返回值为:" + result)
          print("The return value of operation <" + node["title"] + "> is: " + result)
      self.outputParameters[node["title"]] = result
      if keep_as_field:
          self.OUTPUT.append(new_line(
              self.outputParameters, self.maxViewLength, self.outputParametersRecord))
  def switchSelect(self, para, loopValue):
      """Change the selected option of the drop-down located by ``para["xpath"]``.

      ``para["optionMode"]`` picks the strategy: 0 = advance to the next
      option (wrapping around), 1 = select by index, 2 = select by value,
      3 = select by visible text; ``para["optionValue"]`` is the target for
      modes 1-3. Failures are reported on the console and not raised.
      """
      mode = int(para["optionMode"])
      wanted = para["optionValue"]
      try:
          box = Select(self.browser.find_element(
              By.XPATH, para["xpath"], iframe=para["iframe"]))
          try:
              if mode == 0:
                  # Step from the currently selected option to the next one.
                  following = (box.options.index(
                      box.first_selected_option) + 1) % len(box.options)
                  box.select_by_index(following)
              elif mode == 1:
                  box.select_by_index(int(wanted))
              elif mode == 2:
                  box.select_by_value(wanted)
              elif mode == 3:
                  box.select_by_visible_text(wanted)
          except:
              print("切换下拉框选项失败:", para["xpath"],
                    para["optionMode"], para["optionValue"])
              print("Failed to change drop-down box option:",
                    para["xpath"], para["optionMode"], para["optionValue"])
      except:
          print("找不到下拉框元素:", para["xpath"])
          print("Cannot find drop-down box element:", para["xpath"])
  def moveToElement(self, para, loopElement=None, loopPath="", index=0):
      """Move the mouse pointer onto an element.

      With ``para["useLoop"]`` truthy the target is the ``index``-th match of
      ``loopPath``; otherwise it is the first match of ``para["xpath"]``.
      Lookup and move failures are printed, not raised; the printed path is
      always ``para["xpath"]``.
      """
      time.sleep(0.1)  # short pause before moving
      if para["useLoop"]:
          # In a loop the path handed in is the real XPath of the items.
          target_path = loopPath
      else:
          index = 0
          target_path = para["xpath"]
      try:
          target = self.browser.find_elements(
              By.XPATH, target_path, iframe=para["iframe"])[index]
          try:
              ActionChains(self.browser).move_to_element(target).perform()
          except:
              print("移动鼠标到元素失败:", para["xpath"])
              print("Failed to move mouse to element:", para["xpath"])
      except:
          print("找不到元素:", para["xpath"])
          print("Cannot find element:", para["xpath"])
  427. # 执行节点关键函数部分
  def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
      """Execute one node of the task graph, then apply its post-wait.

      Args:
          nodeId: Index of the node in ``self.procedure``.
          loopValue: Current loop item, forwarded to the node handler.
          loopPath: XPath of the enclosing loop, forwarded where used.
          index: Position of the current item inside the enclosing loop.

      Option codes: 0/10 run the child sequence, 1 open page, 2 click,
      3 extract data, 4 input text, 5 custom operation, 6 switch drop-down,
      7 move mouse, 8 loop, 9 conditional branch. Every option except 0 and
      2 then sleeps for ``parameters["wait"]`` seconds (0.01 when negative),
      randomised to 0.5x-1.5x when ``waitType`` is 1. Finally
      ``self.event.wait()`` is called.

      No element-visibility wait is performed before dispatch, so nodes need
      no ``xpath`` parameter to be executed.
      """
      node = self.procedure[nodeId]
      option = node["option"]

      if option == 0 or option == 10:  # root node / branch body
          for child in node["sequence"]:
              self.executeNode(child, loopValue, loopPath, index)
      elif option == 1:  # open page
          self.recordLog("openPage")
          self.openPage(node["parameters"], loopValue)
      elif option == 2:  # click element
          self.recordLog("Click")
          self.clickElement(node["parameters"], loopValue, loopPath, index)
      elif option == 3:  # extract data
          self.recordLog("getData")
          self.getData(node["parameters"], loopValue, node["isInLoop"],
                       parentPath=loopPath, index=index)
          self.saveData()
      elif option == 4:  # input text
          self.inputInfo(node["parameters"], loopValue)
      elif option == 5:  # custom operation
          self.customOperation(node, loopValue, loopPath, index)
          self.saveData()
      elif option == 6:  # switch drop-down option
          self.switchSelect(node["parameters"], loopValue)
      elif option == 7:  # move mouse to element
          self.moveToElement(node["parameters"], loopValue, loopPath, index)
      elif option == 8:  # loop
          self.recordLog("loop")
          self.loopExecute(node, loopValue, loopPath, index)
      elif option == 9:  # conditional branch
          self.recordLog("judge")
          self.judgeExecute(node, loopValue, loopPath, index)

      # Post-wait; clicking (2) handles its own waiting and the root (0)
      # never waits.
      if option != 0 and option != 2:
          waitTime = 0.01  # default wait in seconds
          if node["parameters"]["wait"] >= 0:
              waitTime = node["parameters"]["wait"]
          try:
              waitType = int(node["parameters"]["waitType"])
          except (KeyError, TypeError, ValueError):
              waitType = 0  # missing or unusable type means a fixed wait
          if waitType == 0:  # fixed wait
              time.sleep(waitTime)
          elif waitType == 1:  # random wait around the configured value
              time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
          self.Log("Wait seconds after node executing: ", waitTime)
      self.event.wait()  # blocks until the event is set
  479. # 对判断条件的处理
  def judgeExecute(self, node, loopElement, clickPath="", index=0):
      """Evaluate the branches of a condition node and run the first match.

      Each id in ``node["sequence"]`` names a branch node whose
      ``parameters["class"]`` selects the test: 0 = unconditional, 1 = page
      body contains text, 2 = page contains element (XPath), 3 = loop item
      contains text, 4 = loop item contains element, 5/6/7 = result of
      JavaScript / system command / JavaScript on the loop item. For 5-7 a
      result containing "rue" or parsing to a positive integer counts as a
      match. A test that raises counts as "no match". The first matching
      branch is executed; if none matches, nothing is executed.
      """
      chosen = 0  # 0 also means "no branch selected"
      for branch_id in node["sequence"]:
          settings = self.procedure[branch_id]["parameters"]
          kind = int(settings["class"])
          hit = False
          if kind == 0:  # no condition at all
              hit = True
          elif kind == 1:  # page contains text
              try:
                  page_text = self.browser.find_element(
                      By.CSS_SELECTOR, "body", iframe=settings["iframe"]).text
                  hit = page_text.find(settings["value"]) >= 0
              except:  # body not found: try the next branch
                  hit = False
          elif kind == 2:  # page contains element
              try:
                  hit = bool(self.browser.find_element(
                      By.XPATH, settings["value"], iframe=settings["iframe"]))
              except:  # element missing or XPath invalid
                  hit = False
          elif kind == 3:  # loop item contains text
              try:
                  hit = loopElement.text.find(settings["value"]) >= 0
              except:
                  hit = False
          elif kind == 4:  # loop item contains element
              try:
                  # The first character of the XPath is dropped before the
                  # lookup relative to the loop item.
                  hit = bool(loopElement.find_element(
                      By.XPATH, settings["value"][1:]))
              except:
                  hit = False
          elif kind <= 7:  # decided by a code return value
              if kind == 5:  # JavaScript
                  output = self.execute_code(
                      0, settings["code"], settings["waitTime"], iframe=settings["iframe"])
              elif kind == 6:  # system command
                  output = self.execute_code(
                      1, settings["code"], settings["waitTime"], iframe=settings["iframe"])
              elif kind == 7:  # JavaScript against the current loop item
                  output = self.execute_code(
                      2, settings["code"], settings["waitTime"], loopElement, iframe=settings["iframe"])
              try:
                  # "rue" matches both "true" and "True".
                  verdict = 1 if output.find("rue") != -1 else int(output)
              except:
                  verdict = 0
              hit = verdict > 0
          if hit:
              chosen = branch_id
              break
      if chosen != 0:
          self.executeNode(chosen, loopElement, clickPath, index)
  541. # 对循环的处理
  542. def loopExecute(self, node, loopValue, clickPath="", index=0):
  543. time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
  544. # self.Log("循环执行前等待0.1秒")
  545. self.Log("Wait 0.1 second before loop")
  546. thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
  547. thisHistoryLength = self.browser.execute_script(
  548. 'return history.length') # 记录本次循环内的history的length
  549. self.history["index"] = thisHistoryLength
  550. self.history["handle"] = thisHandle
  551. if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
  552. # 无跳转标签页操作
  553. count = 0 # 执行次数
  554. while True: # do while循环
  555. try:
  556. finished = False
  557. element = self.browser.find_element(
  558. By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  559. for i in node["sequence"]: # 挨个执行操作
  560. self.executeNode(
  561. i, element, node["parameters"]["xpath"], 0)
  562. if self.BREAK: # 如果有break操作,下面的操作不执行
  563. break
  564. if self.BREAK: # 如果有break操作,退出循环
  565. self.BREAK = False
  566. finished = True
  567. break
  568. finished = True
  569. self.Log("Click: ", node["parameters"]["xpath"])
  570. self.recordLog("Click:" + node["parameters"]["xpath"])
  571. except NoSuchElementException:
  572. # except:
  573. print("Single loop element not found: ",
  574. node["parameters"]["xpath"])
  575. print("找不到要循环的单个元素: ", node["parameters"]["xpath"])
  576. self.recordLog(
  577. "Single loop element not found: " + node["parameters"]["xpath"])
  578. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  579. if node["option"] != 2:
  580. self.executeNode(
  581. i, None, node["parameters"]["xpath"], 0)
  582. finished = True
  583. break # 如果找不到元素,退出循环
  584. finally:
  585. if not finished:
  586. print("\n\n-------Retrying-------\n\n")
  587. self.Log("-------Retrying-------: ",
  588. node["parameters"]["xpath"])
  589. self.recordLog("ClickNotFound:" +
  590. node["parameters"]["xpath"])
  591. for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
  592. if node["option"] != 2:
  593. self.executeNode(
  594. i, None, node["parameters"]["xpath"], 0)
  595. break # 如果找不到元素,退出循环
  596. count = count + 1
  597. self.Log("Page: ", count)
  598. self.recordLog("Page:" + str(count))
  599. # print(node["parameters"]["exitCount"], "-------")
  600. if node["parameters"]["exitCount"] == count: # 如果达到设置的退出循环条件的话
  601. break
  602. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  603. output = self.execute_code(int(
  604. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  605. code = get_output_code(output)
  606. if code <= 0:
  607. break
  608. elif int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
  609. try:
  610. elements = self.browser.find_elements(By.XPATH,
  611. node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
  612. if len(elements) == 0:
  613. print("Loop element not found: ",
  614. node["parameters"]["xpath"])
  615. print("找不到循环元素: ", node["parameters"]["xpath"])
  616. self.recordLog("pathNotFound: " +
  617. node["parameters"]["xpath"])
  618. for index in range(len(elements)):
  619. for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
  620. self.executeNode(i, elements[index],
  621. node["parameters"]["xpath"], index)
  622. if self.BREAK:
  623. break
  624. if self.BREAK:
  625. self.BREAK = False
  626. break
  627. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  628. while True: # 一直关闭窗口直到当前标签页
  629. self.browser.close() # 关闭使用完的标签页
  630. self.browser.switch_to.window(
  631. self.browser.window_handles[-1])
  632. if self.browser.current_window_handle == thisHandle:
  633. break
  634. if self.history["index"] != thisHistoryLength and self.history[
  635. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  636. difference = thisHistoryLength - \
  637. self.history["index"] # 计算历史记录变化差值
  638. self.browser.execute_script(
  639. 'history.go(' + str(difference) + ')') # 回退历史记录
  640. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  641. time.sleep(node["parameters"]["historyWait"])
  642. # else:
  643. # time.sleep(2)
  644. # 切换历史记录等待:
  645. self.Log("Change history back time or:",
  646. node["parameters"]["historyWait"])
  647. self.browser.execute_script('window.stop()')
  648. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  649. output = self.execute_code(int(
  650. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  651. code = get_output_code(output)
  652. if code <= 0:
  653. break
  654. except NoSuchElementException:
  655. print("Loop element not found: ", node["parameters"]["xpath"])
  656. print("找不到循环元素: ", node["parameters"]["xpath"])
  657. self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
  658. except Exception as e:
  659. raise
  660. elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
  661. # 千万不要忘了分割!!
  662. for path in node["parameters"]["pathList"].split("\n"):
  663. try:
  664. element = self.browser.find_element(
  665. By.XPATH, path, iframe=node["parameters"]["iframe"])
  666. for i in node["sequence"]: # 挨个执行操作
  667. self.executeNode(i, element, path, 0)
  668. if self.BREAK:
  669. break
  670. if self.BREAK:
  671. self.BREAK = False
  672. break
  673. if self.browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化
  674. while True: # 一直关闭窗口直到当前标签页
  675. self.browser.close() # 关闭使用完的标签页
  676. self.browser.switch_to.window(
  677. self.browser.window_handles[-1])
  678. if self.browser.current_window_handle == thisHandle:
  679. break
  680. if self.history["index"] != thisHistoryLength and self.history[
  681. "handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
  682. difference = thisHistoryLength - \
  683. self.history["index"] # 计算历史记录变化差值
  684. self.browser.execute_script(
  685. 'history.go(' + str(difference) + ')') # 回退历史记录
  686. # if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
  687. time.sleep(node["parameters"]["historyWait"])
  688. # else:
  689. # time.sleep(2)
  690. self.Log("Change history back time or:",
  691. node["parameters"]["historyWait"])
  692. self.browser.execute_script('window.stop()')
  693. except NoSuchElementException:
  694. print("Loop element not found: ", path)
  695. print("找不到循环元素: ", path)
  696. self.recordLog("pathNotFound: " + path)
  697. continue # 循环中找不到元素就略过操作
  698. except Exception as e:
  699. raise
  700. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  701. output = self.execute_code(int(
  702. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  703. code = get_output_code(output)
  704. if code <= 0:
  705. break
  706. elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
  707. textList = node["parameters"]["textList"].split("\n")
  708. for text in textList:
  709. self.recordLog("input: " + text)
  710. for i in node["sequence"]: # 挨个执行操作
  711. self.executeNode(i, text, "", 0)
  712. if self.BREAK:
  713. break
  714. if self.BREAK:
  715. self.BREAK = False
  716. break
  717. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  718. output = self.execute_code(int(
  719. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  720. code = get_output_code(output)
  721. if code <= 0:
  722. break
  723. elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
  724. # tempList = node["parameters"]["textList"].split("\r\n")
  725. urlList = list(
  726. filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
  727. # urlList = []
  728. # for url in tempList:
  729. # if url != "":
  730. # urlList.append(url)
  731. for url in urlList:
  732. self.recordLog("input: " + url)
  733. for i in node["sequence"]:
  734. self.executeNode(i, url, "", 0)
  735. if self.BREAK:
  736. break
  737. if self.BREAK:
  738. self.BREAK = False
  739. break
  740. if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
  741. output = self.execute_code(int(
  742. node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
  743. code = get_output_code(output)
  744. if code <= 0:
  745. break
  746. elif int(node["parameters"]["loopType"]) <= 6: # 命令返回值
  747. while True: # do while循环
  748. if int(node["parameters"]["loopType"]) == 5: # JS
  749. output = self.execute_code(
  750. 0, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  751. elif int(node["parameters"]["loopType"]) == 6: # System
  752. output = self.execute_code(
  753. 1, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"])
  754. code = get_output_code(output)
  755. if code <= 0:
  756. break
  757. for i in node["sequence"]: # 挨个执行操作
  758. self.executeNode(i, code, node["parameters"]["xpath"], 0)
  759. if self.BREAK:
  760. break
  761. if self.BREAK:
  762. self.BREAK = False
  763. break
  764. self.history["index"] = thisHistoryLength
  765. self.history["handle"] = self.browser.current_window_handle
  766. self.scrollDown(node["parameters"])
  767. # 打开网页事件
  768. def openPage(self, para, loopValue):
  769. time.sleep(1) # 打开网页后强行等待至少1秒
  770. if len(self.browser.window_handles) > 1:
  771. self.browser.switch_to.window(
  772. self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
  773. self.browser.close()
  774. self.browser.switch_to.window(
  775. self.browser.window_handles[0]) # 打开网页操作从第1个页面开始
  776. self.history["handle"] = self.browser.current_window_handle
  777. if para["useLoop"]:
  778. url = loopValue
  779. elif para["url"] != "about:blank":
  780. url = self.links[self.urlId]
  781. # clear output parameters
  782. for key in self.outputParameters:
  783. self.outputParameters[key] = ""
  784. else:
  785. url = list(filter(isnull, para["links"].split("\n")))[0]
  786. # 将value中的Field[""]替换为outputParameters中的键值
  787. pattern = r'Field\["([^"]+)"\]'
  788. try:
  789. replaced_text = re.sub(
  790. pattern, lambda match: self.outputParameters.get(match.group(1), ''), url)
  791. except:
  792. replaced_text = url
  793. url = replaced_text
  794. try:
  795. maxWaitTime = int(para["maxWaitTime"])
  796. except:
  797. maxWaitTime = 10 # 默认最大等待时间为10秒
  798. try:
  799. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  800. self.browser.set_script_timeout(maxWaitTime)
  801. self.browser.get(url)
  802. if para["cookies"] != "":
  803. self.browser.delete_all_cookies() # 清除所有已有cookie
  804. cookies = para["cookies"].split('\n')
  805. for cookie in cookies:
  806. name, value = cookie.split('=', 1)
  807. cookie_dict = {'name': name, 'value': value}
  808. # 加载 cookie
  809. self.browser.add_cookie(cookie_dict)
  810. self.Log('Loading page: ' + url)
  811. self.recordLog('Loading page: ' + url)
  812. except TimeoutException:
  813. self.Log('Time out after set seconds when loading page: ' + url)
  814. self.recordLog(
  815. 'Time out after set seconds when loading page: ' + url)
  816. try:
  817. self.browser.execute_script('window.stop()')
  818. except:
  819. pass
  820. except Exception as e:
  821. print("Failed to load page: " + url)
  822. self.recordLog('Failed to load page: ' + url)
  823. try:
  824. self.history["index"] = self.browser.execute_script(
  825. "return history.length")
  826. except TimeoutException:
  827. try:
  828. self.browser.execute_script('window.stop()')
  829. self.history["index"] = self.browser.execute_script(
  830. "return history.length")
  831. except:
  832. self.history["index"] = 0
  833. self.scrollDown(para) # 控制屏幕向下滚动
  834. # 键盘输入事件
  835. def inputInfo(self, para, loopValue):
  836. time.sleep(0.1) # 输入之前等待0.1秒
  837. self.Log("Wait 0.1 second before input")
  838. try:
  839. textbox = self.browser.find_element(
  840. By.XPATH, para["xpath"], iframe=para["iframe"])
  841. # textbox.send_keys(Keys.CONTROL, 'a')
  842. # textbox.send_keys(Keys.BACKSPACE)
  843. self.execute_code(
  844. 2, para["beforeJS"], para["beforeJSWaitTime"], textbox, iframe=para["iframe"]) # 执行前置JS
  845. # Send the HOME key
  846. textbox.send_keys(Keys.HOME)
  847. # Send the SHIFT + END key combination
  848. textbox.send_keys(Keys.SHIFT, Keys.END)
  849. # Send the DELETE key
  850. textbox.send_keys(Keys.DELETE)
  851. value = ""
  852. if para["useLoop"]:
  853. value = loopValue
  854. else:
  855. value = para["value"]
  856. # 将value中的Field[""]替换为outputParameters中的键值
  857. pattern = r'Field\["([^"]+)"\]'
  858. try:
  859. replaced_text = re.sub(
  860. pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
  861. replaced_text = re.sub(
  862. '<enter>', '', replaced_text, flags=re.IGNORECASE)
  863. except:
  864. replaced_text = value
  865. textbox.send_keys(replaced_text)
  866. if value.lower().find("<enter>") >= 0:
  867. textbox.send_keys(Keys.ENTER)
  868. self.execute_code(
  869. 2, para["afterJS"], para["afterJSWaitTime"], textbox, iframe=para["iframe"]) # 执行后置js
  870. except:
  871. print("Cannot find input box element:" +
  872. para["xpath"] + ", please try to set the wait time before executing this operation")
  873. print("找不到输入框元素:" + para["xpath"] + ",请尝试在执行此操作前设置等待时间")
  874. self.recordLog("Cannot find input box element:" +
  875. para["xpath"] + "Please try to set the wait time before executing this operation")
  876. # 点击元素事件
  877. def clickElement(self, para, loopElement=None, clickPath="", index=0):
  878. try:
  879. maxWaitTime = int(para["maxWaitTime"])
  880. except:
  881. maxWaitTime = 10
  882. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  883. self.browser.set_script_timeout(maxWaitTime)
  884. # 点击前对该元素执行一段JavaScript代码
  885. try:
  886. # element = self.browser.find_element(
  887. # By.XPATH, path, iframe=para["iframe"])
  888. if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
  889. path = clickPath
  890. # element = loopElement
  891. else:
  892. index = 0
  893. path = para["xpath"] # 不然使用元素定义的xpath
  894. # element = self.browser.find_element(
  895. # By.XPATH, path, iframe=para["iframe"])
  896. elements = self.browser.find_elements(
  897. By.XPATH, path, iframe=para["iframe"])
  898. element = elements[index]
  899. if para["beforeJS"] != "":
  900. self.execute_code(2, para["beforeJS"],
  901. para["beforeJSWaitTime"], element, iframe=para["iframe"])
  902. except:
  903. print("Cannot find element:" +
  904. path + ", please try to set the wait time before executing this operation")
  905. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  906. self.recordLog("Cannot find element:" +
  907. path + ", please try to set the wait time before executing this operation")
  908. tempHandleNum = len(self.browser.window_handles) # 记录之前的窗口位置
  909. try:
  910. click_way = int(para["clickWay"])
  911. except:
  912. click_way = 0
  913. try:
  914. if click_way == 0: # 用selenium的点击方法
  915. actions = ActionChains(self.browser) # 实例化一个action对象
  916. actions.click(element).perform()
  917. elif click_way == 1: # 用js的点击方法
  918. script = 'var result = document.evaluate(`' + path + \
  919. '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
  920. self.browser.execute_script(script, str(index)) # 用js的点击方法
  921. except TimeoutException:
  922. self.Log('Time out after set seconds when loading clicked page')
  923. self.recordLog(
  924. 'Time out after set seconds when loading clicked page')
  925. try:
  926. self.browser.execute_script('window.stop()')
  927. except:
  928. pass
  929. except Exception as e:
  930. self.Log(e)
  931. self.recordLog(str(e))
  932. # 点击后对该元素执行一段JavaScript代码
  933. try:
  934. if para["afterJS"] != "":
  935. element = self.browser.find_element(
  936. By.XPATH, path, iframe=para["iframe"])
  937. self.execute_code(2, para["afterJS"],
  938. para["afterJSWaitTime"], element, iframe=para["iframe"])
  939. except:
  940. print("Cannot find element:" + path)
  941. self.recordLog("Cannot find element:" +
  942. path + ", please try to set the wait time before executing this operation")
  943. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  944. waitTime = float(para["wait"]) + 0.01 # 点击之后等待
  945. try:
  946. waitType = int(para["waitType"])
  947. except:
  948. waitType = 0
  949. if waitType == 0: # 固定等待时间
  950. time.sleep(waitTime)
  951. elif waitType == 1: # 随机等待时间
  952. time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
  953. if tempHandleNum != len(self.browser.window_handles): # 如果有新标签页的行为发生
  954. self.browser.switch_to.window(
  955. self.browser.window_handles[-1]) # 跳转到新的标签页
  956. self.history["handle"] = self.browser.current_window_handle
  957. try:
  958. self.history["index"] = self.browser.execute_script(
  959. "return history.length")
  960. except TimeoutException:
  961. self.browser.execute_script('window.stop()')
  962. self.history["index"] = self.browser.execute_script(
  963. "return history.length")
  964. else:
  965. try:
  966. self.history["index"] = self.browser.execute_script(
  967. "return history.length")
  968. except TimeoutException:
  969. self.browser.execute_script('window.stop()')
  970. self.history["index"] = self.browser.execute_script(
  971. "return history.length")
  972. # 如果打开了新窗口,切换到新窗口
  973. self.scrollDown(para) # 根据参数配置向下滚动
  974. # rt.end()
  975. def get_content(self, p, element):
  976. content = ""
  977. if p["contentType"] == 0:
  978. # 先处理特殊节点类型
  979. if p["nodeType"] == 2:
  980. if element.get_attribute("href") != None:
  981. content = element.get_attribute("href")
  982. else:
  983. content = ""
  984. elif p["nodeType"] == 3:
  985. if element.get_attribute("value") != None:
  986. content = element.get_attribute("value")
  987. else:
  988. content = ""
  989. elif p["nodeType"] == 4: # 图片
  990. if element.get_attribute("src") != None:
  991. content = element.get_attribute("src")
  992. else:
  993. content = ""
  994. try:
  995. downloadPic = p["downloadPic"]
  996. except:
  997. downloadPic = 0
  998. if downloadPic == 1:
  999. download_image(content, "Data/Task_" +
  1000. str(self.id) + "/" + self.saveName + "/")
  1001. else: # 普通节点
  1002. content = element.text
  1003. elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
  1004. if p["nodeType"] == 2:
  1005. if element.get_attribute("href") != None:
  1006. content = element.get_attribute("href")
  1007. else:
  1008. content = ""
  1009. elif p["nodeType"] == 3:
  1010. if element.get_attribute("value") != None:
  1011. content = element.get_attribute("value")
  1012. else:
  1013. content = ""
  1014. elif p["nodeType"] == 4: # 图片
  1015. if element.get_attribute("src") != None:
  1016. content = element.get_attribute("src")
  1017. else:
  1018. content = ""
  1019. try:
  1020. downloadPic = p["downloadPic"]
  1021. except:
  1022. downloadPic = 0
  1023. if downloadPic == 1:
  1024. download_image(content, "Data/Task_" +
  1025. str(self.id) + "/" + self.saveName + "/")
  1026. else:
  1027. command = 'var arr = [];\
  1028. var content = arguments[0];\
  1029. for(var i = 0, len = content.childNodes.length; i < len; i++) {\
  1030. if(content.childNodes[i].nodeType === 3){ \
  1031. arr.push(content.childNodes[i].nodeValue);\
  1032. }\
  1033. }\
  1034. var str = arr.join(" "); \
  1035. return str;'
  1036. content = self.browser.execute_script(command, element).replace(
  1037. "\n", "").replace("\\s+", " ")
  1038. elif p["contentType"] == 2:
  1039. content = element.get_attribute('innerHTML')
  1040. elif p["contentType"] == 3:
  1041. content = element.get_attribute('outerHTML')
  1042. elif p["contentType"] == 4:
  1043. # 获取元素的背景图片地址
  1044. bg_url = element.value_of_css_property('background-image')
  1045. # 清除背景图片地址中的多余字符
  1046. bg_url = bg_url.replace('url("', '').replace('")', '')
  1047. content = bg_url
  1048. elif p["contentType"] == 5:
  1049. content = self.browser.current_url
  1050. elif p["contentType"] == 6:
  1051. content = self.browser.title
  1052. elif p["contentType"] == 7:
  1053. # 获取整个网页的高度和宽度
  1054. height = self.browser.execute_script(
  1055. "return document.body.scrollHeight")
  1056. width = self.browser.execute_script(
  1057. "return document.body.scrollWidth")
  1058. # 调整浏览器窗口的大小
  1059. self.browser.set_window_size(width, height)
  1060. element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
  1061. "/" + str(time.time()) + ".png")
  1062. elif p["contentType"] == 8:
  1063. try:
  1064. screenshot = element.screenshot_as_png
  1065. screenshot_stream = io.BytesIO(screenshot)
  1066. # 使用Pillow库打开截图,并转换为灰度图像
  1067. image = Image.open(screenshot_stream).convert('L')
  1068. # 使用Tesseract OCR引擎识别图像中的文本
  1069. text = pytesseract.image_to_string(image, lang='chi_sim+eng')
  1070. content = text
  1071. except Exception as e:
  1072. content = "OCR Error"
  1073. print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
  1074. if sys.platform == "win32":
  1075. print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  1076. elif sys.platform == "darwin":
  1077. print(e)
  1078. print(
  1079. "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
  1080. elif sys.platform == "linux":
  1081. print(e)
  1082. print(
  1083. "注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
  1084. else:
  1085. print(e)
  1086. print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  1087. elif p["contentType"] == 9:
  1088. content = self.execute_code(
  1089. 2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
  1090. elif p["contentType"] == 12: # 系统命令返回值
  1091. content = self.execute_code(1, p["JS"], p["JSWaitTime"])
  1092. elif p["contentType"] == 10: # 下拉框选中的值
  1093. try:
  1094. select_element = Select(element)
  1095. content = select_element.first_selected_option.get_attribute(
  1096. "value")
  1097. except:
  1098. content = ""
  1099. elif p["contentType"] == 11: # 下拉框选中的文本
  1100. try:
  1101. select_element = Select(element)
  1102. content = select_element.first_selected_option.text
  1103. except:
  1104. content = ""
  1105. return content
  1106. # 提取数据事件
  1107. def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
  1108. pageHTML = etree.HTML(self.browser.page_source)
  1109. if loopElement != "": # 只在数据在循环中提取时才需要获取循环元素
  1110. try:
  1111. loopElementOuterHTML = loopElement.get_attribute('outerHTML')
  1112. except:
  1113. try: # 循环点击每个链接如果没有新标签页打开,loopElement会丢失,此时需要重新获取
  1114. elements = self.browser.find_elements(
  1115. By.XPATH, parentPath, iframe=para["paras"][0]["iframe"])
  1116. loopElement = elements[index]
  1117. loopElementOuterHTML = loopElement.get_attribute(
  1118. 'outerHTML')
  1119. except:
  1120. loopElementOuterHTML = ""
  1121. else:
  1122. loopElementOuterHTML = ""
  1123. loopElementHTML = etree.HTML(loopElementOuterHTML)
  1124. for p in para["paras"]:
  1125. if p["optimizable"]:
  1126. try:
  1127. # 只有当前环境不变变化才可以快速提取数据
  1128. if self.browser.iframe_env != p["iframe"]:
  1129. p["optimizable"] = False
  1130. continue
  1131. p["relativeXPath"] = p["relativeXPath"].lower()
  1132. if p["nodeType"] == 2:
  1133. xpath = p["relativeXPath"] + "/@href"
  1134. elif p["contentType"] == 1:
  1135. xpath = p["relativeXPath"] + "/text()"
  1136. elif p["contentType"] == 0:
  1137. xpath = p["relativeXPath"] + "//text()"
  1138. if p["relative"]:
  1139. # if p["relativeXPath"] == "":
  1140. # content = [loopElementHTML]
  1141. # else:
  1142. # 如果字串里有//即子孙查找,则不动语句
  1143. if p["relativeXPath"].find("//") >= 0:
  1144. full_path = "(" + parentPath + \
  1145. xpath + ")" + \
  1146. "[" + str(index + 1) + "]"
  1147. content = pageHTML.xpath(full_path)
  1148. else:
  1149. content = loopElementHTML.xpath(
  1150. "/html/body/" + loopElementHTML[0][0].tag + xpath)
  1151. else:
  1152. if xpath.find("/body") < 0:
  1153. xpath = "/html/body" + xpath
  1154. content = pageHTML.xpath(xpath)
  1155. if len(content) > 0:
  1156. # html = etree.tostring(content[0], encoding='utf-8').decode('utf-8')
  1157. # 拼接所有文本内容并去掉两边的空白
  1158. content = ' '.join(result.strip()
  1159. for result in content if result.strip())
  1160. else:
  1161. content = p["default"]
  1162. if not self.dataNotFoundKeys[p["name"]]:
  1163. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1164. p["relativeXPath"], p["name"]))
  1165. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
  1166. p["name"], p["relativeXPath"]))
  1167. self.dataNotFoundKeys[p["name"]] = True
  1168. self.recordLog(
  1169. 'Element %s not found, use default' % p["relativeXPath"])
  1170. except Exception as e:
  1171. if not self.dataNotFoundKeys[p["name"]]:
  1172. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1173. p["relativeXPath"], p["name"]))
  1174. print("提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % (
  1175. p["name"], p["relativeXPath"]))
  1176. self.dataNotFoundKeys[p["name"]] = True
  1177. self.recordLog(
  1178. 'Element %s not found, use default' % p["relativeXPath"])
  1179. self.outputParameters[p["name"]] = content
  1180. # 对于不能优化的操作,使用selenium执行
  1181. for p in para["paras"]:
  1182. if not p["optimizable"]:
  1183. content = ""
  1184. if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL,去找元素
  1185. try:
  1186. p["relativeXPath"] = p["relativeXPath"].lower()
  1187. if p["relative"]: # 是否相对xpath
  1188. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1189. element = loopElement
  1190. else:
  1191. # 如果字串里有//即子孙查找,则不动语句
  1192. if p["relativeXPath"].find("//") >= 0:
  1193. full_path = "(" + parentPath + \
  1194. p["relativeXPath"] + ")" + \
  1195. "[" + str(index + 1) + "]"
  1196. element = self.browser.find_element(
  1197. By.XPATH, full_path, iframe=p["iframe"])
  1198. else:
  1199. element = loopElement.find_element(By.XPATH,
  1200. p["relativeXPath"][1:])
  1201. else:
  1202. element = self.browser.find_element(
  1203. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1204. except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
  1205. # print(p)
  1206. try:
  1207. content = p["default"]
  1208. except Exception as e:
  1209. content = ""
  1210. self.outputParameters[p["name"]] = content
  1211. try:
  1212. if not self.dataNotFoundKeys[p["name"]]:
  1213. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
  1214. p["relativeXPath"], p["name"]))
  1215. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
  1216. p["name"], p["relativeXPath"]))
  1217. self.dataNotFoundKeys[p["name"]] = True
  1218. self.recordLog(
  1219. 'Element %s not found, use default' % p["relativeXPath"])
  1220. except:
  1221. pass
  1222. continue
  1223. except TimeoutException: # 超时的时候设置超时值
  1224. self.Log('Time out after set seconds when getting data')
  1225. self.recordLog(
  1226. 'Time out after set seconds when getting data')
  1227. self.browser.execute_script('window.stop()')
  1228. if p["relative"]: # 是否相对xpath
  1229. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1230. element = loopElement
  1231. else:
  1232. element = loopElement.find_element(By.XPATH,
  1233. p["relativeXPath"][1:])
  1234. else:
  1235. element = self.browser.find_element(
  1236. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1237. # rt.end()
  1238. else:
  1239. element = self.browser.find_element(
  1240. By.XPATH, "//body", iframe=p["iframe"])
  1241. try:
  1242. self.execute_code(
  1243. 2, p["beforeJS"], p["beforeJSWaitTime"], element, iframe=p["iframe"]) # 执行前置js
  1244. content = self.get_content(p, element)
  1245. except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
  1246. self.recordLog(
  1247. 'StaleElementReferenceException: '+p["relativeXPath"])
  1248. time.sleep(3)
  1249. try:
  1250. if p["relative"]: # 是否相对xpath
  1251. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1252. element = loopElement
  1253. self.recordLog(
  1254. 'StaleElementReferenceException: loopElement')
  1255. else:
  1256. element = loopElement.find_element(By.XPATH,
  1257. p["relativeXPath"][1:])
  1258. self.recordLog(
  1259. 'StaleElementReferenceException: loopElement+relativeXPath')
  1260. else:
  1261. element = self.browser.find_element(
  1262. By.XPATH, p["relativeXPath"], iframe=p["iframe"])
  1263. self.recordLog(
  1264. 'StaleElementReferenceException: relativeXPath')
  1265. content = self.get_content(p, element)
  1266. except StaleElementReferenceException:
  1267. self.recordLog(
  1268. 'StaleElementReferenceException: '+p["relativeXPath"])
  1269. continue # 再出现类似问题直接跳过
  1270. self.outputParameters[p["name"]] = content
  1271. self.execute_code(
  1272. 2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS
  1273. line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
  1274. self.OUTPUT.append(line)
  1275. # rt.end()
  1276. if __name__ == '__main__':
  1277. config = {
  1278. "id": [0],
  1279. "saved_file_name": "",
  1280. "user_data": False,
  1281. "config_folder": "",
  1282. "config_file_name": "config.json",
  1283. "read_type": "remote",
  1284. "headless": False,
  1285. "server_address": "http://localhost:8074",
  1286. "version": "0.3.5",
  1287. }
  1288. c = Config(config)
  1289. print(c)
  1290. options = Options()
  1291. driver_path = "chromedriver.exe"
  1292. import platform
  1293. print(sys.platform, platform.architecture())
  1294. option = webdriver.ChromeOptions()
  1295. if not os.path.exists(os.getcwd()+"/Data"):
  1296. os.mkdir(os.getcwd()+"/Data")
  1297. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  1298. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1299. # MacOS需要用option而不是options!
  1300. option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1301. option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1302. options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
  1303. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  1304. # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1305. # # MacOS需要用option而不是options!
  1306. # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1307. # driver_path = os.getcwd()+ "/chromedriver_mac64"
  1308. print(driver_path)
  1309. elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
  1310. print("Finding chromedriver in EasySpider",
  1311. os.getcwd()+"/EasySpider")
  1312. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  1313. options.binary_location = os.path.join(
  1314. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1315. driver_path = os.path.join(
  1316. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  1317. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1318. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  1319. options.binary_location = os.path.join(
  1320. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1321. driver_path = os.path.join(
  1322. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  1323. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1324. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  1325. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1326. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  1327. option.add_extension("EasySpider/resources/app/XPathHelper.crx")
  1328. else:
  1329. print("Unsupported platform")
  1330. sys.exit()
  1331. print("Chrome location:", options.binary_location)
  1332. print("Chromedriver location:", driver_path)
  1333. # elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
  1334. # print("Finding chromedriver in ./Chrome",
  1335. # os.getcwd()+"/Chrome")
  1336. # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
  1337. # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
  1338. # driver_path = "./Chrome/chromedriver.exe"
  1339. elif os.path.exists(os.getcwd()+"/../ElectronJS"):
  1340. # 软件dev用
  1341. print("Finding chromedriver in EasySpider",
  1342. os.getcwd()+"/ElectronJS")
  1343. option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1344. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  1345. option.add_extension("../ElectronJS/XPathHelper.crx")
  1346. else:
  1347. options.binary_location = "./chrome.exe" # 指定chrome位置
  1348. driver_path = "./chromedriver.exe"
  1349. option.add_extension("XPathHelper.crx")
  1350. option.add_experimental_option(
  1351. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  1352. options.add_argument('-ignore-certificate-errors')
  1353. options.add_argument('-ignore -ssl-errors')
  1354. option.add_argument('-ignore-certificate-errors')
  1355. option.add_argument('-ignore -ssl-errors')
  1356. # user_data_dir = r'' # 注意没有Default!
  1357. # options.add_argument('--user-data-dir='+p)
  1358. # 总结:
  1359. # 0. 带Cookie需要用userdatadir
  1360. # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
  1361. # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
  1362. # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
  1363. # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
  1364. try:
  1365. with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
  1366. config = json.load(f)
  1367. absolute_user_data_folder = config["absolute_user_data_folder"]
  1368. print("\nAbsolute_user_data_folder:",
  1369. absolute_user_data_folder, "\n")
  1370. except:
  1371. pass
  1372. if c.user_data:
  1373. option.add_argument(
  1374. f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
  1375. option.add_argument("--profile-directory=Default")
  1376. if c.headless:
  1377. print("Headless mode")
  1378. print("无头模式")
  1379. option.add_argument("--headless")
  1380. options.add_argument("--headless")
  1381. # options.add_argument(
  1382. # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
  1383. option.add_argument(
  1384. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1385. options.add_argument(
  1386. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1387. threads = []
  1388. for i in c.id:
  1389. print(options)
  1390. print("id: ", i)
  1391. if c.read_type == "remote":
  1392. print("remote")
  1393. content = requests.get(
  1394. c.server_address + "/queryExecutionInstance?id=" + str(i))
  1395. service = json.loads(content.text) # 加载服务信息
  1396. else:
  1397. print("local")
  1398. with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f:
  1399. content = f.read()
  1400. service = json.loads(content) # 加载服务信息
  1401. print("Task Name:", service["name"])
  1402. print("任务名称:", service["name"])
  1403. try:
  1404. cloudflare = service["cloudflare"]
  1405. except:
  1406. cloudflare = 0
  1407. if cloudflare == 0:
  1408. options.add_experimental_option("prefs", {
  1409. # 设置文件下载路径
  1410. "download.default_directory": "Data/Task_" + str(i),
  1411. "download.prompt_for_download": False, # 禁止下载提示框
  1412. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1413. "download.directory_upgrade": True,
  1414. "download.extensions_to_open": "applications/pdf",
  1415. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1416. })
  1417. option.add_experimental_option("prefs", {
  1418. # 设置文件下载路径
  1419. "download.default_directory": "Data/Task_" + str(i),
  1420. "download.prompt_for_download": False, # 禁止下载提示框
  1421. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1422. "download.directory_upgrade": True,
  1423. "download.extensions_to_open": "applications/pdf",
  1424. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1425. })
  1426. try:
  1427. if service["environment"] == 1:
  1428. option.add_experimental_option(
  1429. 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1430. options.add_experimental_option(
  1431. 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1432. except:
  1433. pass
  1434. browser_t = MyChrome(
  1435. options=options, chrome_options=option, executable_path=driver_path)
  1436. elif cloudflare == 1:
  1437. browser_t = MyUCChrome(
  1438. options=options, chrome_options=option, executable_path=driver_path)
  1439. print("Pass Cloudflare Mode")
  1440. print("过Cloudflare验证模式")
  1441. event = Event()
  1442. event.set()
  1443. thread = BrowserThread(browser_t, i, service,
  1444. c.version, event, c.saved_file_name, config=config)
  1445. print("Thread with task id: ", i, " is created")
  1446. threads.append(thread)
  1447. thread.start()
  1448. Thread(target=check_pause, args=("p", event)).start()
  1449. time.sleep(5)
  1450. print("\n\n----------------------------------")
  1451. print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
  1452. print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
  1453. print("----------------------------------\n\n")
  1454. for thread in threads:
  1455. thread.join()
  1456. for thread in threads:
  1457. thread.browser.quit()
  1458. # print("Thread with task id: ", thread.id, " is closed")
  1459. print("程序已运行完成,请手动关闭此窗口。")
  1460. print("The program has finished running, please manually close this window.")