easyspider_executestage.py 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316
  1. # -*- coding: utf-8 -*-
  2. # import atexit
  3. import io # 遇到错误退出时应执行的代码
  4. import json
  5. # from lib2to3.pgen2 import driver
  6. import re
  7. # import shutil
  8. import subprocess
  9. import sys
  10. # from urllib import parse
  11. # import base64
  12. # import hashlib
  13. import time
  14. import keyboard
  15. import requests
  16. from lxml import etree
  17. from selenium.webdriver.chrome.options import Options
  18. from selenium.webdriver.common.keys import Keys
  19. from selenium.webdriver.common.action_chains import ActionChains
  20. from selenium import webdriver
  21. from selenium.webdriver.support.ui import WebDriverWait
  22. from selenium.webdriver.support import expected_conditions as EC
  23. from selenium.webdriver.common.by import By
  24. from selenium.common.exceptions import NoSuchElementException
  25. from selenium.common.exceptions import TimeoutException
  26. from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
  27. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  28. from selenium.webdriver.support.ui import Select
  29. from selenium.webdriver import ActionChains
  30. import undetected_chromedriver as uc
  31. import random
  32. # import numpy
  33. import csv
  34. import os
  35. from selenium.webdriver.common.by import By
  36. from commandline_config import Config
  37. import pytesseract
  38. from PIL import Image
  39. import uuid
  40. from threading import Thread, Event
# Module-level Chrome capabilities. Note this binds the name to the shared
# DesiredCapabilities.CHROME dict (no copy is made), so the next line mutates
# that shared dict in place.
desired_capabilities = DesiredCapabilities.CHROME
# "none" page-load strategy: driver calls do not block until the page finishes loading.
desired_capabilities["pageLoadStrategy"] = "none"
  43. # 控制流程的暂停和继续
  44. def check_file(filename, event):
  45. while True:
  46. if keyboard.is_pressed('p'): # 按下p键,暂停程序
  47. if event._flag == False:
  48. print("任务执行中,长按p键暂停执行。")
  49. print("Task is running, long press 'p' to pause.")
  50. # 设置Event的值为True,使得线程b可以继续执行
  51. event.set()
  52. else:
  53. # 设置Event的值为False,使得线程b暂停执行
  54. print("任务已暂停,长按p键继续执行...")
  55. print("Task paused, press 'p' to continue...")
  56. event.clear()
  57. time.sleep(1) # 每秒检查一次
  58. def download_image(url, save_directory):
  59. # 定义浏览器头信息
  60. headers = {
  61. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  62. }
  63. # 发送 GET 请求获取图片数据
  64. response = requests.get(url, headers=headers)
  65. # 检查响应状态码是否为成功状态
  66. if response.status_code == requests.codes.ok:
  67. # 提取文件名
  68. file_name = url.split('/')[-1].split("?")[0]
  69. # 生成唯一的新文件名
  70. new_file_name = file_name + '_' + str(uuid.uuid4()) + '_' + file_name
  71. # 构建保存路径
  72. save_path = os.path.join(save_directory, new_file_name)
  73. # 保存图片到本地
  74. with open(save_path, 'wb') as file:
  75. file.write(response.content)
  76. print("图片已成功下载到:", save_path)
  77. print("The image has been successfully downloaded to:", save_path)
  78. else:
  79. print("下载图片失败,请检查此图片链接是否有效:", url)
  80. print("Failed to download image, please check if this image link is valid:", url)
  81. def get_output_code(output):
  82. try:
  83. if output.find("rue") != -1: # 如果返回值中包含true
  84. code = 1
  85. else:
  86. code = int(output)
  87. except:
  88. code = 0
  89. return code
  90. # 判断字段是否为空
  91. def isnull(s):
  92. return len(s) != 0
  93. class Time:
  94. def __init__(self, type1=""):
  95. self.t = int(round(time.time() * 1000))
  96. self.type = type1
  97. def end(self):
  98. at = int(round(time.time() * 1000))
  99. print("Time used for", self.type, ":", at - self.t, "ms")
  100. class BrowserThread(Thread):
  101. def __init__(self, browser_t, id, service, version, event):
  102. Thread.__init__(self)
  103. self.browser = browser_t
  104. self.id = id
  105. self.event = event
  106. self.saveName = saveName
  107. self.log = ""
  108. self.OUTPUT = ""
  109. self.SAVED = False
  110. stealth_path = driver_path[:driver_path.find("chromedriver")] + "stealth.min.js"
  111. with open(stealth_path, 'r') as f:
  112. js = f.read()
  113. print("Loading stealth.min.js")
  114. self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
  115. WebDriverWait(self.browser, 10)
  116. self.browser.get('about:blank')
  117. self.procedure = service["graph"] # 程序执行流程
  118. try:
  119. if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
  120. pass
  121. else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
  122. if service["version"] != version:
  123. print("版本不一致,请使用" + service["version"] + "版本的EasySpider运行该任务!")
  124. print("Version not match, please use EasySpider " + service["version"] + " to run this task!")
  125. self.browser.quit()
  126. sys.exit()
  127. except: # 0.2.0版本没有version字段,所以直接退出
  128. print("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
  129. print("Version not match, please use EasySpider v0.2.0 to run this task!")
  130. self.browser.quit()
  131. sys.exit()
  132. try:
  133. self.save_threshold = service["saveThreshold"] # 保存最低阈值
  134. except:
  135. self.save_threshold = 10
  136. self.links = list(filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
  137. self.OUTPUT = [] # 采集的数据
  138. self.OUTPUT.append([]) # 添加表头
  139. self.containJudge = service["containJudge"] # 是否含有判断语句
  140. self.bodyText = "" # 记录bodyText
  141. tOut = service["outputParameters"] # 生成输出参数对象
  142. self.outputParameters = {}
  143. self.dataNotFoundKeys = {} # 记录没有找到数据的key
  144. self.log = "" # 记下现在总共开了多少个标签页
  145. self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
  146. self.SAVED = False # 记录是否已经存储了
  147. for para in tOut:
  148. if para["name"] not in self.outputParameters.keys():
  149. self.outputParameters[para["name"]] = ""
  150. self.dataNotFoundKeys[para["name"]] = False
  151. self.OUTPUT[0].append(para["name"])
  152. self.urlId = 0 # 全局记录变量
  153. self.preprocess() # 预处理,优化提取数据流程
  154. # 检测如果没有复杂的操作,优化提取数据流程
  155. def preprocess(self):
  156. for node in self.procedure:
  157. if node["option"] == 3: # 提取数据操作
  158. paras = node["parameters"]["paras"]
  159. for para in paras:
  160. if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para["nodeType"] <= 2:
  161. para["optimizable"] = True
  162. else:
  163. para["optimizable"] = False
  164. def run(self):
  165. # 挨个执行程序
  166. for i in range(len(self.links)):
  167. self.executeNode(0)
  168. self.urlId = self.urlId + 1
  169. files = os.listdir("Data/" + self.saveName)
  170. # 如果目录为空,则删除该目录
  171. if not files:
  172. os.rmdir("Data/" + self.saveName)
  173. # os.remove("Data/" + self.saveName + "_control.txt")
  174. print("Done!")
  175. print("执行完成!")
  176. self.recordLog("Done!")
  177. self.saveData(exit=True)
  178. def recordLog(self, str=""):
  179. self.log = self.log + str + "\n"
  180. # 控制台打印log函数
  181. def Log(self, text, text2=""):
  182. switch = False
  183. if switch:
  184. print(text, text2)
  185. # @atexit.register
  186. # def clean(self):
  187. # self.saveData(exit=True)
  188. # self.browser.quit()
  189. # sys.exit(0)
  190. def saveData(self, exit=False):
  191. if exit == True or len(self.OUTPUT) >= self.save_threshold: # 每save_threshold条保存一次
  192. with open("Data/"+ self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
  193. file_obj.write(self.log)
  194. file_obj.close()
  195. with open("Data/"+ self.saveName + '.csv', 'a', encoding='utf-8-sig', newline="") as f:
  196. f_csv = csv.writer(f)
  197. for line in self.OUTPUT:
  198. f_csv.writerow(line)
  199. f.close()
  200. self.OUTPUT = []
  201. self.log = ""
  202. def scrollDown(self, para, rt=""):
  203. time.sleep(para["scrollWaitTime"]) # 下拉前等待
  204. scrollType = int(para["scrollType"])
  205. try:
  206. if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
  207. for i in range(para["scrollCount"]):
  208. self.Log("Wait for set second after screen scrolling")
  209. body = self.browser.find_element(By.CSS_SELECTOR, "body")
  210. if scrollType == 1:
  211. body.send_keys(Keys.PAGE_DOWN)
  212. elif scrollType == 2:
  213. body.send_keys(Keys.END)
  214. time.sleep(para["scrollWaitTime"]) # 下拉完等待
  215. except:
  216. self.Log('time out after set seconds when scrolling. ')
  217. self.recordLog('time out after set seconds when scrolling')
  218. self.browser.execute_script('window.stop()')
  219. if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
  220. for i in range(para["scrollCount"]):
  221. self.Log("Wait for set second after screen scrolling")
  222. body = self.browser.find_element(By.CSS_SELECTOR, "body")
  223. if scrollType == 1:
  224. body.send_keys(Keys.PGDN)
  225. elif scrollType == 2:
  226. body.send_keys(Keys.END)
  227. time.sleep(para["scrollWaitTime"]) # 下拉完等待
  228. if rt != "":
  229. rt.end()
  230. def execute_code(self, codeMode, code, max_wait_time, element=None):
  231. output = ""
  232. if code == "":
  233. return ""
  234. if max_wait_time == 0:
  235. max_wait_time = 999999
  236. # print(codeMode, code)
  237. pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
  238. try:
  239. replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), code)
  240. except:
  241. replaced_text = code
  242. code = replaced_text
  243. if int(codeMode) == 0:
  244. self.recordLog("Execute JavaScript:" + code)
  245. self.recordLog("执行JavaScript:" + code)
  246. self.browser.set_script_timeout(max_wait_time)
  247. try:
  248. output = self.browser.execute_script(code)
  249. except:
  250. output = ""
  251. self.recordLog("JavaScript execution failed")
  252. elif int(codeMode) == 2:
  253. self.recordLog("Execute JavaScript for element:" + code)
  254. self.recordLog("对元素执行JavaScript:" + code)
  255. self.browser.set_script_timeout(max_wait_time)
  256. try:
  257. output = self.browser.execute_script(code, element)
  258. except:
  259. output = ""
  260. self.recordLog("JavaScript execution failed")
  261. elif int(codeMode) == 1:
  262. self.recordLog("Execute System Call:" + code)
  263. self.recordLog("执行系统命令:" + code)
  264. # 执行系统命令
  265. try:
  266. # output = subprocess.run(code, capture_output=True, text=True, timeout=max_wait_time, encoding="utf-8", shell=True)
  267. output = subprocess.run(code, capture_output=True, text=True, timeout=max_wait_time, shell=True)
  268. # 输出命令返回值
  269. output = output.stdout
  270. print(output)
  271. except subprocess.TimeoutExpired:
  272. # 命令执行时间超过指定值,抛出异常
  273. self.recordLog("Command timed out")
  274. self.recordLog("命令执行超时")
  275. except Exception as e:
  276. print(e) # 打印异常信息
  277. self.recordLog("Command execution failed")
  278. self.recordLog("命令执行失败")
  279. return str(output)
  280. def customOperation(self, node, loopValue, loopPath, index):
  281. paras = node["parameters"]
  282. codeMode = int(paras["codeMode"])
  283. code = paras["code"]
  284. max_wait_time = int(paras["waitTime"])
  285. if codeMode == 2: # 使用循环的情况下,传入的clickPath就是实际的xpath
  286. try:
  287. elements = self.browser.find_elements(By.XPATH, loopPath)
  288. element = elements[index]
  289. output = self.execute_code(codeMode, code, max_wait_time, element)
  290. except:
  291. output = ""
  292. print("JavaScript execution failed")
  293. else:
  294. output = self.execute_code(codeMode, code, max_wait_time)
  295. recordASField = int(paras["recordASField"])
  296. if recordASField:
  297. self.outputParameters[node["title"]] = output
  298. line = []
  299. for value in self.outputParameters.values():
  300. line.append(value)
  301. print(value[:15], " ", end="")
  302. print("")
  303. self.OUTPUT.append(line)
  304. def switchSelect(self, para, loopValue):
  305. optionMode = int(para["optionMode"])
  306. optionValue = para["optionValue"]
  307. try:
  308. dropdown = Select(self.browser.find_element(By.XPATH, para["xpath"]))
  309. try:
  310. if optionMode == 0:
  311. # 获取当前选中的选项索引
  312. current_index = dropdown.options.index(dropdown.first_selected_option)
  313. # 计算下一个选项的索引
  314. next_index = (current_index + 1) % len(dropdown.options)
  315. # 选择下一个选项
  316. dropdown.select_by_index(next_index)
  317. elif optionMode == 1:
  318. dropdown.select_by_index(int(optionValue))
  319. elif optionMode == 2:
  320. dropdown.select_by_value(optionValue)
  321. elif optionMode == 3:
  322. dropdown.select_by_visible_text(optionValue)
  323. except:
  324. print("切换下拉框选项失败:", para["xpath"], para["optionMode"], para["optionValue"])
  325. print("Failed to change drop-down box option:", para["xpath"], para["optionMode"], para["optionValue"])
  326. except:
  327. print("找不到下拉框元素:", para["xpath"])
  328. print("Cannot find drop-down box element:", para["xpath"])
  329. def moveToElement(self, para, loopElement=None, loopPath="", index=0):
  330. time.sleep(0.1) # 移动之前等待0.1秒
  331. if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
  332. path = loopPath
  333. else:
  334. index = 0
  335. path = para["xpath"] # 不然使用元素定义的xpath
  336. try:
  337. elements = self.browser.find_elements(By.XPATH, path)
  338. element = elements[index]
  339. try:
  340. ActionChains(self.browser).move_to_element(element).perform()
  341. except:
  342. print("移动鼠标到元素失败:", para["xpath"])
  343. print("Failed to move mouse to element:", para["xpath"])
  344. except:
  345. print("找不到元素:", para["xpath"])
  346. print("Cannot find element:", para["xpath"])
  347. # 执行节点关键函数部分
  348. def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
  349. node = self.procedure[nodeId]
  350. WebDriverWait(self.browser, 10).until
  351. # 等待元素出现才进行操作,10秒内未出现则报错
  352. (EC.visibility_of_element_located((By.XPATH, node["parameters"]["xpath"])))
  353. # 根据不同选项执行不同操作
  354. if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作
  355. for i in node["sequence"]: # 从根节点开始向下读取
  356. self.executeNode(i, loopValue, loopPath, index)
  357. elif node["option"] == 1: # 打开网页操作
  358. self.recordLog("openPage")
  359. self.openPage(node["parameters"], loopValue)
  360. elif node["option"] == 2: # 点击元素
  361. self.recordLog("Click")
  362. self.clickElement(node["parameters"], loopValue, loopPath, index)
  363. elif node["option"] == 3: # 提取数据
  364. self.recordLog("getData")
  365. self.getData(node["parameters"], loopValue, node["isInLoop"],
  366. parentPath=loopPath, index=index)
  367. self.saveData()
  368. elif node["option"] == 4: # 输入文字
  369. self.inputInfo(node["parameters"], loopValue)
  370. elif node["option"] == 5: # 自定义操作
  371. self.customOperation(node, loopValue, loopPath, index)
  372. self.saveData()
  373. elif node["option"] == 6: # 切换下拉框
  374. self.switchSelect(node["parameters"], loopValue)
  375. elif node["option"] == 7: # 鼠标移动到元素上
  376. self.moveToElement(node["parameters"], loopValue, loopPath, index)
  377. elif node["option"] == 8: # 循环
  378. self.recordLog("loop")
  379. self.loopExecute(node, loopValue, loopPath, index) # 执行循环
  380. elif node["option"] == 9: # 条件分支
  381. self.recordLog("judge")
  382. self.judgeExecute(node, loopValue, loopPath, index)
  383. # 执行完之后进行等待
  384. if node["option"] != 0 and node["option"] != 2: # 点击元素操作单独定义等待时间操作
  385. waitTime = 0.01 # 默认等待0.01秒
  386. if node["parameters"]["wait"] >= 0:
  387. waitTime = node["parameters"]["wait"]
  388. try:
  389. waitType = int(node["parameters"]["waitType"])
  390. except:
  391. waitType = 0
  392. if waitType == 0: # 固定等待时间
  393. time.sleep(waitTime)
  394. elif waitType == 1: # 随机等待时间
  395. time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
  396. self.Log("Wait seconds after node executing: ", waitTime)
  397. self.event.wait() # 等待事件结束
  398. # 对判断条件的处理
  399. def judgeExecute(self, node, loopElement, clickPath="", index=0):
  400. executeBranchId = 0 # 要执行的BranchId
  401. for i in node["sequence"]:
  402. cnode = self.procedure[i] # 获得条件分支
  403. tType = int(cnode["parameters"]["class"]) # 获得判断条件类型
  404. if tType == 0: # 什么条件都没有
  405. executeBranchId = i
  406. break
  407. elif tType == 1: # 当前页面包含文本
  408. try:
  409. if self.bodyText.find(cnode["parameters"]["value"]) >= 0:
  410. executeBranchId = i
  411. break
  412. except: # 找不到元素下一个条件
  413. continue
  414. elif tType == 2: # 当前页面包含元素
  415. try:
  416. if self.browser.find_element(By.XPATH, cnode["parameters"]["value"]):
  417. executeBranchId = i
  418. break
  419. except: # 找不到元素或者xpath写错了,下一个条件
  420. continue
  421. elif tType == 3: # 当前循环元素包括文本
  422. try:
  423. if loopElement.text.find(cnode["parameters"]["value"]) >= 0:
  424. executeBranchId = i
  425. break
  426. except: # 找不到元素或者xpath写错了,下一个条件
  427. continue
  428. elif tType == 4: # 当前循环元素包括元素
  429. try:
  430. if loopElement.find_element(By.XPATH, cnode["parameters"]["value"][1:]):
  431. executeBranchId = i
  432. break
  433. except: # 找不到元素或者xpath写错了,下一个条件
  434. continue
  435. elif tType <= 7: # JS命令返回值
  436. if tType == 5: # JS命令返回值等于
  437. output = self.execute_code(0, cnode["parameters"]["code"], cnode["parameters"]["waitTime"])
  438. elif tType == 6: # System
  439. output = self.execute_code(1, cnode["parameters"]["code"], cnode["parameters"]["waitTime"])
  440. elif tType == 7: # 针对当前循环项的JS命令返回值
  441. output = self.execute_code(2, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement)
  442. try:
  443. if output.find("rue") != -1: # 如果返回值中包含true
  444. code = 1
  445. else:
  446. code = int(output)
  447. except:
  448. code = 0
  449. if code > 0:
  450. executeBranchId = i
  451. break
  452. # rt.end()
  453. if executeBranchId != 0:
  454. self.executeNode(executeBranchId, loopElement, clickPath, index)
  455. # 对循环的处理
    def loopExecute(self, node, loopValue, clickPath="", index=0):
        """Execute a loop node by running its child sequence once per loop item.

        ``node["parameters"]["loopType"]`` selects what is iterated:

        * 0 - a single element (``xpath``), repeated until it can no longer
          be found, ``exitCount`` iterations are reached, or the break
          script says stop
        * 1 - every element currently matching ``xpath``
        * 2 - each XPath line of ``pathList``
        * 3 - each text line of ``textList``
        * 4 - each non-empty URL line of ``textList``
        * 5 / 6 - repeat while a JavaScript / system command yields a code > 0

        When ``breakMode`` is > 0, ``breakCode`` is run after an iteration
        (mode ``breakMode - 1``) and the loop stops once its code is <= 0.

        For element loops (types 1 and 2), extra windows opened during an
        iteration are closed and history is rewound to the state recorded
        when the loop started. After the loop, the history bookkeeping is
        reset and ``scrollDown`` is applied with the loop's parameters.

        Args:
            node: The loop node.
            loopValue: Not read by this method.
            clickPath: Not read by this method.
            index: Incoming value is not read; rebound as the element index
                in loop type 1.
        """
        time.sleep(0.1)  # short forced pause before the loop starts
        self.Log("Wait 0.1 second before loop")
        thisHandle = self.browser.current_window_handle  # window handle the loop runs in
        thisHistoryLength = self.browser.execute_script(
            'return history.length')  # history.length at loop start
        self.history["index"] = thisHistoryLength
        self.history["handle"] = thisHandle
        if int(node["parameters"]["loopType"]) == 0:  # single-element loop
            # No tab-switch handling in this mode.
            count = 0  # completed iterations
            while True:  # do-while style loop
                try:
                    finished = False
                    element = self.browser.find_element(
                        By.XPATH, node["parameters"]["xpath"])
                    for i in node["sequence"]:  # run the child operations in order
                        self.executeNode(i, element, node["parameters"]["xpath"], 0)
                    finished = True
                    self.Log("click: ", node["parameters"]["xpath"])
                    self.recordLog("click:" + node["parameters"]["xpath"])
                except NoSuchElementException:
                    # except:
                    print("Single loop element not found: ", node["parameters"]["xpath"])
                    print("找不到要循环的单个元素: ", node["parameters"]["xpath"])
                    self.recordLog("Single loop element not found: " + node["parameters"]["xpath"])
                    for i in node["sequence"]:  # intended: run the remaining non-click operations (e.g. extraction) once more
                        # NOTE(review): this tests the loop node's own option, not
                        # child i's; as called from executeNode the loop node has
                        # option 8, so the test looks always true - confirm intent.
                        if node["option"] != 2:
                            self.executeNode(i, None, node["parameters"]["xpath"], 0)
                    finished = True
                    break  # element gone: leave the loop
                finally:
                    # Reached with finished == False when the try body raised
                    # anything other than NoSuchElementException.
                    if not finished:
                        print("\n\n-------Retrying-------\n\n")
                        self.Log("-------Retrying-------: ",
                                 node["parameters"]["xpath"])
                        self.recordLog("clickNotFound:" + node["parameters"]["xpath"])
                        for i in node["sequence"]:  # intended: run the remaining non-click operations once more
                            # NOTE(review): same always-true-looking test as above.
                            if node["option"] != 2:
                                self.executeNode(i, None, node["parameters"]["xpath"], 0)
                        # NOTE(review): a break inside finally discards the
                        # in-flight exception and leaves the loop.
                        break
                count = count + 1
                self.Log("Page: ", count)
                self.recordLog("Page:" + str(count))
                # print(node["parameters"]["exitCount"], "-------")
                if node["parameters"]["exitCount"] == count:  # configured iteration limit reached
                    break
                if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                    output = self.execute_code(int(node["parameters"]["breakMode"]) -1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"])
                    code = get_output_code(output)
                    if code <= 0:
                        break
        elif int(node["parameters"]["loopType"]) == 1:  # non-fixed element list
            try:
                elements = self.browser.find_elements(By.XPATH,
                                                      node["parameters"]["xpath"])
                if len(elements) == 0:
                    print("Loop element not found: ", node["parameters"]["xpath"])
                    print("找不到循环元素: ", node["parameters"]["xpath"])
                    self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
                for index in range(len(elements)):  # rebinds the index parameter
                    for i in node["sequence"]:  # run every child operation for this element
                        self.executeNode(i, elements[index],
                                         node["parameters"]["xpath"], index)
                    if self.browser.current_window_handle != thisHandle:  # the iteration ended in another window
                        while True:  # close windows until back at the loop's window
                            self.browser.close()  # close the window that is no longer needed
                            self.browser.switch_to.window(self.browser.window_handles[-1])
                            if self.browser.current_window_handle == thisHandle:
                                break
                    if self.history["index"] != thisHistoryLength and self.history[
                            "handle"] == self.browser.current_window_handle:  # history changed in this same window
                        difference = thisHistoryLength - \
                            self.history["index"]  # difference between recorded and current history positions
                        self.browser.execute_script(
                            'history.go(' + str(difference) + ')')  # move through history by that difference
                        # if node["parameters"]["historyWait"] > 2:
                        time.sleep(node["parameters"]["historyWait"])  # wait after going back
                        # else:
                        # time.sleep(2)
                        self.Log("Change history back time or:",
                                 node["parameters"]["historyWait"])
                        self.browser.execute_script('window.stop()')
                    if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                        output = self.execute_code(int(node["parameters"]["breakMode"]) -1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"])
                        code = get_output_code(output)
                        if code <= 0:
                            break
            except NoSuchElementException:
                print("Loop element not found: ", node["parameters"]["xpath"])
                print("找不到循环元素: ", node["parameters"]["xpath"])
                self.recordLog("pathNotFound: " + node["parameters"]["xpath"])
            except Exception as e:
                raise  # every other error propagates unchanged
        elif int(node["parameters"]["loopType"]) == 2:  # fixed element list
            for path in node["parameters"]["pathList"].split("\n"):  # one XPath per line
                try:
                    element = self.browser.find_element(By.XPATH, path)
                    for i in node["sequence"]:  # run the child operations in order
                        self.executeNode(i, element, path, 0)
                    if self.browser.current_window_handle != thisHandle:  # the iteration ended in another window
                        while True:  # close windows until back at the loop's window
                            self.browser.close()  # close the window that is no longer needed
                            self.browser.switch_to.window(self.browser.window_handles[-1])
                            if self.browser.current_window_handle == thisHandle:
                                break
                    if self.history["index"] != thisHistoryLength and self.history[
                            "handle"] == self.browser.current_window_handle:  # history changed in this same window
                        difference = thisHistoryLength - \
                            self.history["index"]  # difference between recorded and current history positions
                        self.browser.execute_script(
                            'history.go(' + str(difference) + ')')  # move through history by that difference
                        # if node["parameters"]["historyWait"] > 2:
                        time.sleep(node["parameters"]["historyWait"])  # wait after going back
                        # else:
                        # time.sleep(2)
                        self.Log("Change history back time or:",
                                 node["parameters"]["historyWait"])
                        self.browser.execute_script('window.stop()')
                except NoSuchElementException:
                    print("Loop element not found: ", path)
                    print("找不到循环元素: ", path)
                    self.recordLog("pathNotFound: " + path)
                    continue  # skip XPaths that match nothing
                except Exception as e:
                    raise  # every other error propagates unchanged
                if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                    output = self.execute_code(int(node["parameters"]["breakMode"]) -1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"])
                    code = get_output_code(output)
                    if code <= 0:
                        break
        elif int(node["parameters"]["loopType"]) == 3:  # fixed text list
            textList = node["parameters"]["textList"].split("\n")
            for text in textList:
                self.recordLog("input: " + text)
                for i in node["sequence"]:  # run the child operations in order
                    self.executeNode(i, text, "", 0)
                if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                    output = self.execute_code(int(node["parameters"]["breakMode"]) -1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"])
                    code = get_output_code(output)
                    if code <= 0:
                        break
        elif int(node["parameters"]["loopType"]) == 4:  # fixed URL list
            # tempList = node["parameters"]["textList"].split("\r\n")
            urlList = list(
                filter(isnull, node["parameters"]["textList"].split("\n")))  # drop empty lines
            # urlList = []
            # for url in tempList:
            # if url != "":
            # urlList.append(url)
            for url in urlList:
                self.recordLog("input: " + url)
                for i in node["sequence"]:
                    self.executeNode(i, url, "", 0)
                if int(node["parameters"]["breakMode"]) > 0:  # a break-condition script is configured
                    output = self.execute_code(int(node["parameters"]["breakMode"]) -1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"])
                    code = get_output_code(output)
                    if code <= 0:
                        break
        elif int(node["parameters"]["loopType"]) <= 6:  # loop driven by a command's return value
            while True:  # do-while style loop
                if int(node["parameters"]["loopType"]) == 5:  # JavaScript
                    output = self.execute_code(0, node["parameters"]["code"], node["parameters"]["waitTime"])
                elif int(node["parameters"]["loopType"]) == 6:  # system command
                    output = self.execute_code(1, node["parameters"]["code"], node["parameters"]["waitTime"])
                code = get_output_code(output)
                if code <= 0:
                    break
                for i in node["sequence"]:  # run the child operations; the numeric code is the loop value
                    self.executeNode(i, code, node["parameters"]["xpath"], 0)
        # Reset the history bookkeeping to the loop-start length and the current window.
        self.history["index"] = thisHistoryLength
        self.history["handle"] = self.browser.current_window_handle
        self.scrollDown(node["parameters"])
  631. # 打开网页事件
  632. def openPage(self, para, loopValue):
  633. time.sleep(1) # 打开网页后强行等待至少1秒
  634. if len(self.browser.window_handles) > 1:
  635. self.browser.switch_to.window(self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
  636. self.browser.close()
  637. self.browser.switch_to.window(self.browser.window_handles[0]) # 打开网页操作从第1个页面开始
  638. self.history["handle"] = self.browser.current_window_handle
  639. if para["useLoop"]:
  640. url = loopValue
  641. elif para["url"] != "about:blank":
  642. url = self.links[self.urlId]
  643. # clear output parameters
  644. for key in self.outputParameters:
  645. self.outputParameters[key] = ""
  646. else:
  647. url = list(filter(isnull, para["links"].split("\n")))[0]
  648. pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
  649. try:
  650. replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), url)
  651. except:
  652. replaced_text = url
  653. url = replaced_text
  654. try:
  655. maxWaitTime = int(para["maxWaitTime"])
  656. except:
  657. maxWaitTime = 10 # 默认最大等待时间为10秒
  658. try:
  659. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  660. self.browser.set_script_timeout(maxWaitTime)
  661. self.browser.get(url)
  662. self.Log('Loading page: ' + url)
  663. self.recordLog('Loading page: ' + url)
  664. except TimeoutException:
  665. self.Log('time out after set seconds when loading page: ' + url)
  666. self.recordLog('time out after set seconds when loading page: ' + url)
  667. self.browser.execute_script('window.stop()')
  668. try:
  669. self.history["index"] = self.browser.execute_script("return history.length")
  670. except TimeoutException:
  671. self.browser.execute_script('window.stop()')
  672. self.history["index"] = self.browser.execute_script("return history.length")
  673. self.scrollDown(para) # 控制屏幕向下滚动
  674. if self.containJudge:
  675. try:
  676. self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
  677. self.Log('URL Page: ' + url)
  678. self.recordLog('URL Page: ' + url)
  679. except TimeoutException:
  680. self.Log('Time out after set seconds when getting body text: ' + url)
  681. self.recordLog('Time out after set seconds when getting body text:: ' + url)
  682. self.browser.execute_script('window.stop()')
  683. time.sleep(1)
  684. self.Log("Need to wait 1 second to get body text")
  685. # 再执行一遍
  686. self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
  687. except Exception as e:
  688. self.Log(e)
  689. self.recordLog(str(e))
  690. # 键盘输入事件
  691. def inputInfo(self, para, loopValue):
  692. time.sleep(0.1) # 输入之前等待0.1秒
  693. self.Log("Wait 0.1 second before input")
  694. try:
  695. textbox = self.browser.find_element(By.XPATH, para["xpath"])
  696. # textbox.send_keys(Keys.CONTROL, 'a')
  697. # textbox.send_keys(Keys.BACKSPACE)
  698. self.execute_code(2, para["beforeJS"], para["beforeJSWaitTime"], textbox) # 执行前置JS
  699. # Send the HOME key
  700. textbox.send_keys(Keys.HOME)
  701. # Send the SHIFT + END key combination
  702. textbox.send_keys(Keys.SHIFT, Keys.END)
  703. # Send the DELETE key
  704. textbox.send_keys(Keys.DELETE)
  705. value = ""
  706. if para["useLoop"]:
  707. value = loopValue
  708. else:
  709. value = para["value"]
  710. pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
  711. try:
  712. replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
  713. replaced_text = re.sub('<enter>', '', replaced_text, flags=re.IGNORECASE)
  714. except:
  715. replaced_text = value
  716. textbox.send_keys(replaced_text)
  717. if value.lower().find("<enter>") >= 0:
  718. textbox.send_keys(Keys.ENTER)
  719. self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], textbox) # 执行后置js
  720. # global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
  721. self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
  722. except:
  723. print("Cannot find input box element:" +
  724. para["xpath"] + ", please try to set the wait time before executing this operation")
  725. print("找不到输入框元素:" + para["xpath"] + ",请尝试在执行此操作前设置等待时间")
  726. self.recordLog("Cannot find input box element:" +
  727. para["xpath"] + "Please try to set the wait time before executing this operation")
  728. # 点击元素事件
  729. def clickElement(self, para, loopElement=None, clickPath="", index=0):
  730. time.sleep(0.1) # 点击之前等待0.1秒
  731. self.Log("Wait 0.1 second before clicking element")
  732. if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
  733. path = clickPath
  734. else:
  735. path = para["xpath"] # 不然使用元素定义的xpath
  736. try:
  737. maxWaitTime = int(para["maxWaitTime"])
  738. except:
  739. maxWaitTime = 10
  740. self.browser.set_page_load_timeout(maxWaitTime) # 加载页面最大超时时间
  741. self.browser.set_script_timeout(maxWaitTime)
  742. # 点击前对该元素执行一段JavaScript代码
  743. try:
  744. element = self.browser.find_element(By.XPATH, path)
  745. if para["beforeJS"] != "":
  746. self.execute_code(2, para["beforeJS"], para["beforeJSWaitTime"], element)
  747. except:
  748. print("Cannot find element:" +
  749. path + ", please try to set the wait time before executing this operation")
  750. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  751. self.recordLog("Cannot find element:" +
  752. path + ", please try to set the wait time before executing this operation")
  753. tempHandleNum = len(self.browser.window_handles) # 记录之前的窗口位置
  754. try:
  755. click_way = int(para["clickWay"])
  756. except:
  757. click_way = 0
  758. try:
  759. if click_way == 0: # 用selenium的点击方法
  760. actions = ActionChains(self.browser) # 实例化一个action对象
  761. actions.click(element).perform()
  762. elif click_way == 1: # 用js的点击方法
  763. script = 'var result = document.evaluate(`' + path + \
  764. '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
  765. self.browser.execute_script(script, str(index)) # 用js的点击方法
  766. except TimeoutException:
  767. self.Log('time out after set seconds when loading clicked page')
  768. self.recordLog('time out after set seconds when loading clicked page')
  769. self.browser.execute_script('window.stop()')
  770. except Exception as e:
  771. self.Log(e)
  772. self.recordLog(str(e))
  773. # 点击前对该元素执行一段JavaScript代码
  774. try:
  775. if para["afterJS"] != "":
  776. element = self.browser.find_element(By.XPATH, path)
  777. self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], element)
  778. except:
  779. print("Cannot find element:" + path)
  780. self.recordLog("Cannot find element:" +
  781. path + ", please try to set the wait time before executing this operation")
  782. print("找不到要点击的元素:" + path + ",请尝试在执行此操作前设置等待时间")
  783. waitTime = float(para["wait"]) + 0.01 # 点击之后等待
  784. try:
  785. waitType = int(para["waitType"])
  786. except:
  787. waitType = 0
  788. if waitType == 0: # 固定等待时间
  789. time.sleep(waitTime)
  790. elif waitType == 1: # 随机等待时间
  791. time.sleep(random.uniform(waitTime * 0.5, waitTime * 1.5))
  792. if tempHandleNum != len(self.browser.window_handles): # 如果有新标签页的行为发生
  793. self.browser.switch_to.window(self.browser.window_handles[-1]) # 跳转到新的标签页
  794. self.history["handle"] = self.browser.current_window_handle
  795. try:
  796. self.history["index"] = self.browser.execute_script("return history.length")
  797. except TimeoutException:
  798. self.browser.execute_script('window.stop()')
  799. self.history["index"] = self.browser.execute_script("return history.length")
  800. else:
  801. try:
  802. self.history["index"] = self.browser.execute_script("return history.length")
  803. except TimeoutException:
  804. self.browser.execute_script('window.stop()')
  805. self.history["index"] = self.browser.execute_script("return history.length")
  806. # 如果打开了新窗口,切换到新窗口
  807. self.scrollDown(para) # 根据参数配置向下滚动
  808. if self.containJudge: # 有判断语句才执行以下操作
  809. # global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
  810. try:
  811. self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
  812. except TimeoutException:
  813. self.Log('time out after 10 seconds when getting body text')
  814. self.recordLog('time out after 10 seconds when getting body text')
  815. self.browser.execute_script('window.stop()')
  816. time.sleep(1)
  817. self.Log("wait one second after get body text")
  818. # 再执行一遍
  819. self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
  820. # rt.end()
  821. except Exception as e:
  822. self.Log(e)
  823. self.recordLog(str(e))
  824. # rt.end()
  825. def get_content(self, p, element):
  826. content = ""
  827. if p["contentType"] == 0:
  828. # 先处理特殊节点类型
  829. if p["nodeType"] == 2:
  830. if element.get_attribute("href") != None:
  831. content = element.get_attribute("href")
  832. else:
  833. content = ""
  834. elif p["nodeType"] == 3:
  835. if element.get_attribute("value") != None:
  836. content = element.get_attribute("value")
  837. else:
  838. content = ""
  839. elif p["nodeType"] == 4: # 图片
  840. if element.get_attribute("src") != None:
  841. content = element.get_attribute("src")
  842. else:
  843. content = ""
  844. try:
  845. downloadPic = p["downloadPic"]
  846. except:
  847. downloadPic = 0
  848. if downloadPic == 1:
  849. download_image(content, "Data/" + self.saveName + "/")
  850. else: # 普通节点
  851. content = element.text
  852. elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
  853. if p["nodeType"] == 2:
  854. if element.get_attribute("href") != None:
  855. content = element.get_attribute("href")
  856. else:
  857. content = ""
  858. elif p["nodeType"] == 3:
  859. if element.get_attribute("value") != None:
  860. content = element.get_attribute("value")
  861. else:
  862. content = ""
  863. elif p["nodeType"] == 4: # 图片
  864. if element.get_attribute("src") != None:
  865. content = element.get_attribute("src")
  866. else:
  867. content = ""
  868. try:
  869. downloadPic = p["downloadPic"]
  870. except:
  871. downloadPic = 0
  872. if downloadPic == 1:
  873. download_image(content, "Data/" + self.saveName + "/")
  874. else:
  875. command = 'var arr = [];\
  876. var content = arguments[0];\
  877. for(var i = 0, len = content.childNodes.length; i < len; i++) {\
  878. if(content.childNodes[i].nodeType === 3){ \
  879. arr.push(content.childNodes[i].nodeValue);\
  880. }\
  881. }\
  882. var str = arr.join(" "); \
  883. return str;'
  884. content = self.browser.execute_script(command, element).replace(
  885. "\n", "").replace("\\s+", " ")
  886. elif p["contentType"] == 2:
  887. content = element.get_attribute('innerHTML')
  888. elif p["contentType"] == 3:
  889. content = element.get_attribute('outerHTML')
  890. elif p["contentType"] == 4:
  891. # 获取元素的背景图片地址
  892. bg_url = element.value_of_css_property('background-image')
  893. # 清除背景图片地址中的多余字符
  894. bg_url = bg_url.replace('url("', '').replace('")', '')
  895. content = bg_url
  896. elif p["contentType"] == 5:
  897. content = self.browser.current_url
  898. elif p["contentType"] == 6:
  899. content = self.browser.title
  900. elif p["contentType"] == 7:
  901. # 获取整个网页的高度和宽度
  902. height = self.browser.execute_script("return document.body.scrollHeight");
  903. width = self.browser.execute_script("return document.body.scrollWidth");
  904. # 调整浏览器窗口的大小
  905. self.browser.set_window_size(width, height)
  906. element.screenshot("Data/" + self.saveName + "/"+ str(time.time()) + ".png")
  907. elif p["contentType"] == 8:
  908. try:
  909. screenshot = element.screenshot_as_png
  910. screenshot_stream = io.BytesIO(screenshot)
  911. # 使用Pillow库打开截图,并转换为灰度图像
  912. image = Image.open(screenshot_stream).convert('L')
  913. # 使用Tesseract OCR引擎识别图像中的文本
  914. text = pytesseract.image_to_string(image, lang='chi_sim+eng')
  915. content = text
  916. except Exception as e:
  917. content = "OCR Error"
  918. print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
  919. if sys.platform == "win32":
  920. print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  921. elif sys.platform == "darwin":
  922. print(e)
  923. print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
  924. elif sys.platform == "linux":
  925. print(e)
  926. print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
  927. else:
  928. print(e)
  929. print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
  930. elif p["contentType"] == 9:
  931. content = self.execute_code(2, p["JS"], p["JSWaitTime"], element)
  932. elif p["contentType"] == 10: # 下拉框选中的值
  933. try:
  934. select_element = Select(element)
  935. content = select_element.first_selected_option.get_attribute("value")
  936. except:
  937. content = ""
  938. elif p["contentType"] == 11: # 下拉框选中的文本
  939. try:
  940. select_element = Select(element)
  941. content = select_element.first_selected_option.text
  942. except:
  943. content = ""
  944. return content
  945. # 提取数据事件
  946. def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
  947. pageHTML = etree.HTML(self.browser.page_source)
  948. try:
  949. loopElementOuterHTML = loopElement.get_attribute('outerHTML')
  950. except:
  951. loopElementOuterHTML = ""
  952. loopElementHTML = etree.HTML(loopElementOuterHTML)
  953. for p in para["paras"]:
  954. if p["optimizable"]:
  955. try:
  956. p["relativeXPath"] = p["relativeXPath"].lower()
  957. if p["nodeType"] == 2:
  958. xpath = p["relativeXPath"] + "/@href"
  959. elif p["contentType"] == 1:
  960. xpath = p["relativeXPath"] + "/text()"
  961. elif p["contentType"] == 0:
  962. xpath = p["relativeXPath"] + "//text()"
  963. if p["relative"]:
  964. # if p["relativeXPath"] == "":
  965. # content = [loopElementHTML]
  966. # else:
  967. if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
  968. full_path = "(" + parentPath + \
  969. xpath + ")" + \
  970. "[" + str(index + 1) + "]"
  971. content = pageHTML.xpath(full_path)
  972. else:
  973. content = loopElementHTML.xpath("/html/body/" + loopElementHTML[0][0].tag + xpath)
  974. else:
  975. if xpath.find("/html/body") < 0:
  976. xpath = "/html/body" + xpath
  977. content = pageHTML.xpath(xpath)
  978. if len(content) > 0:
  979. # html = etree.tostring(content[0], encoding='utf-8').decode('utf-8')
  980. # 拼接所有文本内容并去掉两边的空白
  981. content = ' '.join(result.strip() for result in content if result.strip())
  982. else:
  983. content = p["default"]
  984. try:
  985. if not self.dataNotFoundKeys[p["name"]]:
  986. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
  987. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
  988. self.dataNotFoundKeys[p["name"]] = True
  989. self.recordLog('Element %s not found, use default' % p["relativeXPath"])
  990. except:
  991. pass
  992. except Exception as e:
  993. print(e)
  994. self.outputParameters[p["name"]] = content
  995. # 对于不能优化的操作,使用selenium执行
  996. for p in para["paras"]:
  997. if not p["optimizable"]:
  998. content = ""
  999. if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL,去找元素
  1000. try:
  1001. p["relativeXPath"] = p["relativeXPath"].lower()
  1002. if p["relative"]: # 是否相对xpath
  1003. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1004. element = loopElement
  1005. else:
  1006. if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
  1007. full_path = "(" + parentPath + \
  1008. p["relativeXPath"] + ")" + \
  1009. "[" + str(index + 1) + "]"
  1010. element = self.browser.find_element(By.XPATH, full_path)
  1011. else:
  1012. element = loopElement.find_element(By.XPATH,
  1013. p["relativeXPath"][1:])
  1014. else:
  1015. element = self.browser.find_element(By.XPATH, p["relativeXPath"])
  1016. except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
  1017. # print(p)
  1018. try:
  1019. content = p["default"]
  1020. except Exception as e:
  1021. content = ""
  1022. self.outputParameters[p["name"]] = content
  1023. try:
  1024. if not self.dataNotFoundKeys[p["name"]]:
  1025. print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
  1026. print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
  1027. self.dataNotFoundKeys[p["name"]] = True
  1028. self.recordLog('Element %s not found, use default' % p["relativeXPath"])
  1029. except:
  1030. pass
  1031. continue
  1032. except TimeoutException: # 超时的时候设置超时值
  1033. self.Log('time out after set seconds when getting data')
  1034. self.recordLog('time out after set seconds when getting data')
  1035. self.browser.execute_script('window.stop()')
  1036. if p["relative"]: # 是否相对xpath
  1037. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1038. element = loopElement
  1039. else:
  1040. element = loopElement.find_element(By.XPATH,
  1041. p["relativeXPath"][1:])
  1042. else:
  1043. element = self.browser.find_element(By.XPATH, p["relativeXPath"])
  1044. # rt.end()
  1045. else:
  1046. element = self.browser.find_element(By.XPATH, "//body")
  1047. try:
  1048. self.execute_code(2, p["beforeJS"], p["beforeJSWaitTime"], element) # 执行前置js
  1049. content = self.get_content(p, element)
  1050. except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
  1051. self.recordLog('StaleElementReferenceException: '+p["relativeXPath"])
  1052. time.sleep(3)
  1053. try:
  1054. if p["relative"]: # 是否相对xpath
  1055. if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
  1056. element = loopElement
  1057. self.recordLog('StaleElementReferenceException: loopElement')
  1058. else:
  1059. element = loopElement.find_element(By.XPATH,
  1060. p["relativeXPath"][1:])
  1061. self.recordLog(
  1062. 'StaleElementReferenceException: loopElement+relativeXPath')
  1063. else:
  1064. element = self.browser.find_element(
  1065. By.XPATH, p["relativeXPath"])
  1066. self.recordLog('StaleElementReferenceException: relativeXPath')
  1067. content = self.get_content(p, element)
  1068. except StaleElementReferenceException:
  1069. self.recordLog('StaleElementReferenceException: '+p["relativeXPath"])
  1070. continue # 再出现类似问题直接跳过
  1071. self.outputParameters[p["name"]] = content
  1072. self.execute_code(2, p["afterJS"], p["afterJSWaitTime"], element) # 执行后置JS
  1073. line = []
  1074. for value in self.outputParameters.values():
  1075. line.append(value)
  1076. print(value[:15], " ", end="")
  1077. print("")
  1078. self.OUTPUT.append(line)
  1079. # rt.end()
  1080. if __name__ == '__main__':
  1081. config = {
  1082. "id": [0],
  1083. "saved_file_name": "",
  1084. "user_data": False,
  1085. "config_folder": "",
  1086. "config_file_name": "config.json",
  1087. "read_type": "remote",
  1088. "headless": False,
  1089. "server_address": "http://localhost:8074",
  1090. "version": "0.3.3",
  1091. }
  1092. c = Config(config)
  1093. print(c)
  1094. options = Options()
  1095. driver_path = "chromedriver.exe"
  1096. import platform
  1097. print(sys.platform, platform.architecture())
  1098. option = webdriver.ChromeOptions()
  1099. if not os.path.exists(os.getcwd()+"/Data"):
  1100. os.mkdir(os.getcwd()+"/Data")
  1101. if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
  1102. options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1103. # MacOS需要用option而不是options!
  1104. option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
  1105. driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
  1106. # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1107. # # MacOS需要用option而不是options!
  1108. # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
  1109. # driver_path = os.getcwd()+ "/chromedriver_mac64"
  1110. print(driver_path)
  1111. elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
  1112. print("Finding chromedriver in EasySpider",
  1113. os.getcwd()+"/EasySpider")
  1114. if sys.platform == "win32" and platform.architecture()[0] == "32bit":
  1115. options.binary_location = os.path.join(
  1116. os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
  1117. driver_path = os.path.join(
  1118. os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
  1119. elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
  1120. options.binary_location = os.path.join(
  1121. os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
  1122. driver_path = os.path.join(
  1123. os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
  1124. elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
  1125. options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
  1126. driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
  1127. else:
  1128. print("Unsupported platform")
  1129. sys.exit()
  1130. print("Chrome location:", options.binary_location)
  1131. print("Chromedriver location:", driver_path)
  1132. # elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
  1133. # print("Finding chromedriver in ./Chrome",
  1134. # os.getcwd()+"/Chrome")
  1135. # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
  1136. # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
  1137. # driver_path = "./Chrome/chromedriver.exe"
  1138. elif os.path.exists(os.getcwd()+"/../ElectronJS"):
  1139. if os.getcwd().find("ElectronJS") >= 0: # 软件dev用
  1140. print("Finding chromedriver in EasySpider",
  1141. os.getcwd())
  1142. option.binary_location = "chrome_win64/chrome.exe"
  1143. driver_path = "chrome_win64/chromedriver_win64.exe"
  1144. else: # 直接在executeStage文件夹内使用python easyspider_executestage.py时的路径
  1145. print("Finding chromedriver in EasySpider",
  1146. os.getcwd()+"/ElectronJS")
  1147. option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
  1148. driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
  1149. else:
  1150. options.binary_location = "./chrome.exe" # 指定chrome位置
  1151. driver_path = "./chromedriver.exe"
  1152. option.add_experimental_option(
  1153. 'excludeSwitches', ['enable-automation']) # 以开发者模式
  1154. options.add_argument('-ignore-certificate-errors')
  1155. options.add_argument('-ignore -ssl-errors')
  1156. option.add_argument('-ignore-certificate-errors')
  1157. option.add_argument('-ignore -ssl-errors')
  1158. # user_data_dir = r'' # 注意没有Default!
  1159. # options.add_argument('--user-data-dir='+p)
  1160. # 总结:
  1161. # 0. 带Cookie需要用userdatadir
  1162. # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
  1163. # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
  1164. # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
  1165. # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
  1166. if c.user_data:
  1167. with open(c.config_folder + c.config_file_name,"r", encoding='utf-8') as f:
  1168. config = json.load(f)
  1169. absolute_user_data_folder = config["absolute_user_data_folder"]
  1170. print("\nAbsolute_user_data_folder:",absolute_user_data_folder,"\n")
  1171. option.add_argument(f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
  1172. option.add_argument("--profile-directory=Default")
  1173. if c.headless:
  1174. print("Headless mode")
  1175. print("无头模式")
  1176. option.add_argument("--headless")
  1177. options.add_argument("--headless")
  1178. # options.add_argument(
  1179. # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
  1180. option.add_argument(
  1181. "--disable-blink-features=AutomationControlled") # TMALL 反扒
  1182. options.add_argument("--disable-blink-features=AutomationControlled") # TMALL 反扒
  1183. threads = []
  1184. for i in c.id:
  1185. print(options)
  1186. print("id: ", i)
  1187. if c.saved_file_name != "":
  1188. saveName = "task_" + str(i) + "_" + c.saved_file_name # 保存文件的名字
  1189. else:
  1190. saveName = "task_" + str(i) + "_" + \
  1191. str(random.randint(0, 999999999)) # 保存文件的名字
  1192. print("Save Name for task ID", i, "is:", saveName)
  1193. print("任务ID", i, "的保存文件名为:", saveName)
  1194. os.mkdir("Data/" + saveName) # 创建保存文件夹用来保存截图
  1195. # with open("Data/" + saveName + "_control.txt", "w", encoding='utf-8') as f:
  1196. # f.write("1")
  1197. # f.close()
  1198. if c.read_type == "remote":
  1199. print("remote")
  1200. content = requests.get(c.server_address + "/queryExecutionInstance?id=" + str(i))
  1201. service = json.loads(content.text) # 加载服务信息
  1202. else:
  1203. print("local")
  1204. with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f:
  1205. content = f.read()
  1206. service = json.loads(content) # 加载服务信息
  1207. print("Task Name:", service["name"])
  1208. print("任务名称:", service["name"])
  1209. try:
  1210. cloudflare = service["cloudflare"]
  1211. except:
  1212. cloudflare = 0
  1213. if cloudflare == 0:
  1214. options.add_experimental_option("prefs", {
  1215. "download.default_directory": "Data/", # 设置文件下载路径
  1216. "download.prompt_for_download": False, # 禁止下载提示框
  1217. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1218. "download.directory_upgrade": True,
  1219. "download.extensions_to_open": "applications/pdf",
  1220. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1221. })
  1222. option.add_experimental_option("prefs", {
  1223. "download.default_directory": "Data/", # 设置文件下载路径
  1224. "download.prompt_for_download": False, # 禁止下载提示框
  1225. "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
  1226. "download.directory_upgrade": True,
  1227. "download.extensions_to_open": "applications/pdf",
  1228. "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
  1229. })
  1230. try:
  1231. if service["environment"] == 1:
  1232. option.add_experimental_option('mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1233. options.add_experimental_option('mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
  1234. except:
  1235. pass
  1236. browser_t = webdriver.Chrome(
  1237. options=options, chrome_options=option, executable_path=driver_path)
  1238. elif cloudflare == 1:
  1239. browser_t = uc.Chrome(
  1240. options=options, chrome_options=option, executable_path=driver_path)
  1241. print("Pass Cloudflare Mode")
  1242. print("过Cloudflare验证模式")
  1243. event = Event()
  1244. event.set()
  1245. thread = BrowserThread(browser_t, i, service, c.version, event)
  1246. print("Thread with task id: ", i, " is created")
  1247. threads.append(thread)
  1248. thread.start()
  1249. Thread(target=check_file, args=("Data/" + saveName + "_control.txt", event)).start()
  1250. time.sleep(5)
  1251. print("\n\n----------------------------------")
  1252. print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
  1253. print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
  1254. print("----------------------------------\n\n")
  1255. for thread in threads:
  1256. thread.join()
  1257. for thread in threads:
  1258. thread.browser.quit()
  1259. print("Thread with task id: ", thread.id, " is closed")