Browse Source

Official Version of 0.6.0

naibo 1 year ago
parent
commit
ed0768ca51
49 changed files with 291 additions and 36 deletions
  1. 118 28
      .temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py
  2. 151 1
      .temp_to_pub/EasySpider_windows_x64/Code/utils.py
  3. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/0.json
  4. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/1.json
  5. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
  6. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
  7. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
  8. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
  9. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
  10. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
  11. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
  12. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
  13. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
  14. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
  15. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
  16. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
  17. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
  18. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
  19. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
  20. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
  21. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
  22. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
  23. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
  24. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
  25. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
  26. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
  27. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
  28. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
  29. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
  30. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
  31. 0 0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
  32. 0 0
      .temp_to_pub/EasySpider_windows_x64/tasks/149.json
  33. 0 0
      .temp_to_pub/EasySpider_windows_x64/tasks/213.json
  34. 0 0
      .temp_to_pub/EasySpider_windows_x64/tasks/296.json
  35. 0 0
      .temp_to_pub/EasySpider_windows_x64/tasks/8.json
  36. 1 0
      .temp_to_pub/compress.py
  37. BIN
      ElectronJS/EasySpider_en.crx
  38. BIN
      ElectronJS/EasySpider_zh.crx
  39. 1 0
      ElectronJS/clean_and_release_win32.cmd
  40. 1 0
      ElectronJS/clean_and_release_win64.cmd
  41. 7 1
      ElectronJS/main.js
  42. 1 0
      ElectronJS/package_linux64.sh
  43. 1 0
      ElectronJS/package_macos.sh
  44. 1 1
      ElectronJS/src/taskGrid/FlowChart.html
  45. 1 1
      ElectronJS/src/taskGrid/FlowChart_CN.html
  46. 1 1
      ElectronJS/src/taskGrid/executeTask.html
  47. 1 1
      ExecuteStage/.vscode/launch.json
  48. 2 2
      ExecuteStage/easyspider_executestage.py
  49. 4 0
      ExecuteStage/utils.py

+ 118 - 28
.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py

@@ -6,8 +6,8 @@ import platform
 import shutil
 import shutil
 import string
 import string
 import undetected_chromedriver as uc
 import undetected_chromedriver as uc
-from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
-    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
+from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
+    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
 from myChrome import MyChrome
 from myChrome import MyChrome
 from threading import Thread, Event
 from threading import Thread, Event
 from PIL import Image
 from PIL import Image
@@ -47,10 +47,11 @@ import requests
 from ddddocr import DdddOcr
 from ddddocr import DdddOcr
 from urllib.parse import urljoin
 from urllib.parse import urljoin
 from lxml import etree, html
 from lxml import etree, html
+
 import onnxruntime
 import onnxruntime
 
 
 onnxruntime.set_default_logger_severity(3)  # 隐藏onnxruntime的日志
 onnxruntime.set_default_logger_severity(3)  # 隐藏onnxruntime的日志
-# import pandas as pd
+import pandas as pd
 # import numpy
 # import numpy
 # import pytesseract
 # import pytesseract
 # import uuid
 # import uuid
@@ -295,9 +296,13 @@ class BrowserThread(Thread):
                     except:
                     except:
                         pass
                         pass
                     try:
                     try:
-                        node["parameters"]["recordASField"] += param["recordASField"]
+                        node["parameters"]["recordASField"] = param["recordASField"]
+                    except:
+                        node["parameters"]["recordASField"] = 1
+                    try:
+                        splitLine = int(param["splitLine"])
                     except:
                     except:
-                        node["parameters"]["recordASField"] += 1
+                        param["splitLine"] = 0
                     if param["contentType"] == 8:
                     if param["contentType"] == 8:
                         self.print_and_log(
                         self.print_and_log(
                             "默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
                             "默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
@@ -333,6 +338,10 @@ class BrowserThread(Thread):
                 except:
                 except:
                     node["parameters"]["exitElement"] = "//body"
                     node["parameters"]["exitElement"] = "//body"
                 node["parameters"]["quickExtractable"] = False # 是否可以快速提取
                 node["parameters"]["quickExtractable"] = False # 是否可以快速提取
+                try:
+                    skipCount = node["parameters"]["skipCount"]
+                except:
+                    node["parameters"]["skipCount"] = 0
                 # 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
                 # 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
                 if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
                 if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
                     try:
                     try:
@@ -347,6 +356,8 @@ class BrowserThread(Thread):
                         node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
                         node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
                     else:
                     else:
                         node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
                         node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
+                    if node["parameters"]["skipCount"] > 0:
+                        node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
                     for param in params:
                     for param in params:
                         optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
                         optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
                         try:
                         try:
@@ -463,21 +474,51 @@ class BrowserThread(Thread):
         self.print_and_log(
         self.print_and_log(
             "Already read input parameters from Excel and overwrite the original input parameters.")
             "Already read input parameters from Excel and overwrite the original input parameters.")
 
 
+    def removeDuplicateData(self):
+        try:
+            removeDuplicateData = self.service["removeDuplicate"]
+        except:
+            removeDuplicateData = 0
+        if removeDuplicateData == 1:
+            self.print_and_log("正在去除重复数据,请稍后……")
+            self.print_and_log("Removing duplicate data, please wait...")
+            if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "json" or self.outputFormat == "xlsx":
+                file_name = "Data/Task_" + \
+                            str(self.id) + "/" + self.saveName + \
+                            '.' + self.outputFormat
+                if self.outputFormat == "csv" or self.outputFormat == "txt":
+                    df = pd.read_csv(file_name)
+                    df.drop_duplicates(inplace=True)
+                    df.to_csv(file_name, index=False)
+                elif self.outputFormat == "xlsx":
+                    df = pd.read_excel(file_name)
+                    df.drop_duplicates(inplace=True)
+                    df.to_excel(file_name, index=False)
+                elif self.outputFormat == "json":
+                    df = pd.read_json(file_name)
+                    df.drop_duplicates(inplace=True)
+                    df.to_json(file_name, orient="records", force_ascii=False)
+            elif self.outputFormat == "mysql":
+                self.mysql.remove_duplicate_data()
+            self.print_and_log("去重完成。")
+            self.print_and_log("Duplicate data removed.")
+
     def run(self):
     def run(self):
         # 挨个执行程序
         # 挨个执行程序
         for i in range(len(self.links)):
         for i in range(len(self.links)):
-            self.print_and_log("正在执行第", i + 1, "/ ", len(self.links), "个链接")
+            self.print_and_log("正在执行第", i + 1, "/", len(self.links), "个链接")
             self.print_and_log("Executing link", i + 1,
             self.print_and_log("Executing link", i + 1,
-                               "/ ", len(self.links))
+                               "/", len(self.links))
             self.executeNode(0)
             self.executeNode(0)
             self.urlId = self.urlId + 1
             self.urlId = self.urlId + 1
         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         # 如果目录为空,则删除该目录
         # 如果目录为空,则删除该目录
-        if not files:
-            os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
+        # if not files:
+        #     os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         self.print_and_log("Done!")
         self.print_and_log("Done!")
         self.print_and_log("执行完成!")
         self.print_and_log("执行完成!")
         self.saveData(exit=True)
         self.saveData(exit=True)
+        self.removeDuplicateData()
         if self.outputFormat == "mysql":
         if self.outputFormat == "mysql":
             self.mysql.close()
             self.mysql.close()
         try:
         try:
@@ -1115,10 +1156,18 @@ class BrowserThread(Thread):
                     if node["parameters"]["exitCount"] == 0:
                     if node["parameters"]["exitCount"] == 0:
                         # newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
                         # newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
                         # 用find_elements获取所有匹配到的文本
                         # 用find_elements获取所有匹配到的文本
-                        exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
-                        newBodyText = ""
-                        for exitElement in exitElements:
-                            newBodyText += exitElement.text
+                        try:
+                            exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
+                            newBodyText = ""
+                            for exitElement in exitElements:
+                                newBodyText += exitElement.text
+                        except Exception as e:
+                            self.print_and_log(f"设定的退出循环元素:{node['parameters']['exitElement']}的文本无法获取,本次循环将不再检测元素文本是否变化,将会继续执行,为解决此问题,您可以修改检测元素文本不变的元素为其他元素,或者将循环次数设定为固定次数大于0的值。")
+                            self.print_and_log(f"The text of the exit loop element set: {node['parameters']['exitElement']} cannot be obtained, this loop will no longer check whether the text of the element has changed, and will continue to execute. To solve this problem, you can modify the element whose text does not change to other elements, or set the number of loops to a fixed number greater than 0.")
+                            self.print_and_log(e)
+                            exitElements = []
+                            # newBodyText为随机文本,保证一直执行
+                            newBodyText = str(random.random())
                         if node["parameters"]["iframe"]:  # 如果标记了iframe
                         if node["parameters"]["iframe"]:  # 如果标记了iframe
                             iframes = self.browser.find_elements(
                             iframes = self.browser.find_elements(
                                 By.CSS_SELECTOR, "iframe", iframe=False)
                                 By.CSS_SELECTOR, "iframe", iframe=False)
@@ -1200,9 +1249,15 @@ class BrowserThread(Thread):
                 if len(elements) == 0:
                 if len(elements) == 0:
                     self.print_and_log("Loop element not found: ",
                     self.print_and_log("Loop element not found: ",
                                        xpath)
                                        xpath)
-                    self.print_and_log("找不到循环元素: ", xpath)
+                    self.print_and_log("找不到循环元素", xpath)
                 index = 0
                 index = 0
+                skipCount = node["parameters"]["skipCount"]
                 while index < len(elements):
                 while index < len(elements):
+                    if index < skipCount:
+                        index += 1
+                        self.print_and_log("跳过第" + str(index) + "个元素")
+                        self.print_and_log("Skip the " + str(index) + "th element")
+                        continue
                     try:
                     try:
                         element = elements[index]
                         element = elements[index]
                         element_text = element.text
                         element_text = element.text
@@ -1250,7 +1305,7 @@ class BrowserThread(Thread):
                     index = index + 1
                     index = index + 1
             except NoSuchElementException:
             except NoSuchElementException:
                 self.print_and_log("Loop element not found: ", xpath)
                 self.print_and_log("Loop element not found: ", xpath)
-                self.print_and_log("找不到循环元素: ", xpath)
+                self.print_and_log("找不到循环元素", xpath)
             except Exception as e:
             except Exception as e:
                 raise
                 raise
         elif int(node["parameters"]["loopType"]) == 2:  # 固定元素列表
         elif int(node["parameters"]["loopType"]) == 2:  # 固定元素列表
@@ -1258,7 +1313,13 @@ class BrowserThread(Thread):
             paths = node["parameters"]["pathList"].split("\n")
             paths = node["parameters"]["pathList"].split("\n")
             # for path in node["parameters"]["pathList"].split("\n"):
             # for path in node["parameters"]["pathList"].split("\n"):
             index = 0
             index = 0
+            skipCount = node["parameters"]["skipCount"]
             while index < len(paths):
             while index < len(paths):
+                if index < skipCount:
+                    index += 1
+                    self.print_and_log("跳过第" + str(index) + "个元素")
+                    self.print_and_log("Skip the " + str(index) + "th element")
+                    continue
                 path = paths[index]
                 path = paths[index]
                 try:
                 try:
                     path = replace_field_values(
                     path = replace_field_values(
@@ -1295,7 +1356,7 @@ class BrowserThread(Thread):
                     index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
                     index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
                 except NoSuchElementException:
                 except NoSuchElementException:
                     self.print_and_log("Loop element not found: ", path)
                     self.print_and_log("Loop element not found: ", path)
-                    self.print_and_log("找不到循环元素: ", path)
+                    self.print_and_log("找不到循环元素", path)
                     index += 1
                     index += 1
                     continue  # 循环中找不到元素就略过操作
                     continue  # 循环中找不到元素就略过操作
                 except Exception as e:
                 except Exception as e:
@@ -1314,7 +1375,14 @@ class BrowserThread(Thread):
             if len(textList) == 1:  # 如果固定文本列表只有一行,现在就可以替换变量
             if len(textList) == 1:  # 如果固定文本列表只有一行,现在就可以替换变量
                 textList = replace_field_values(
                 textList = replace_field_values(
                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
+            skipCount = node["parameters"]["skipCount"]
+            index = 0
             for text in textList:
             for text in textList:
+                if index < skipCount:
+                    index += 1
+                    self.print_and_log("跳过第" + str(index) + "个文本")
+                    self.print_and_log("Skip the " + str(index) + "th text")
+                    continue
                 text = replace_field_values(text, self.outputParameters, self)
                 text = replace_field_values(text, self.outputParameters, self)
                 # self.recordLog("当前循环文本|Current loop text:", text)
                 # self.recordLog("当前循环文本|Current loop text:", text)
                 for i in node["sequence"]:  # 挨个执行操作
                 for i in node["sequence"]:  # 挨个执行操作
@@ -1340,11 +1408,14 @@ class BrowserThread(Thread):
             if len(urlList) == 1:  # 如果固定网址列表只有一行,现在就可以替换变量
             if len(urlList) == 1:  # 如果固定网址列表只有一行,现在就可以替换变量
                 urlList = replace_field_values(
                 urlList = replace_field_values(
                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
-            # urlList = []
-            # for url in tempList:
-            #     if url != "":
-            #         urlList.append(url)
+            skipCount = node["parameters"]["skipCount"]
+            index = 0
             for url in urlList:
             for url in urlList:
+                if index < skipCount:
+                    index += 1
+                    self.print_and_log("跳过第" + str(index) + "个网址")
+                    self.print_and_log("Skip the " + str(index) + "th url")
+                    continue
                 url = replace_field_values(url, self.outputParameters, self)
                 url = replace_field_values(url, self.outputParameters, self)
                 # self.recordLog("当前循环网址|Current loop url:", url)
                 # self.recordLog("当前循环网址|Current loop url:", url)
                 for i in node["sequence"]:
                 for i in node["sequence"]:
@@ -1392,7 +1463,7 @@ class BrowserThread(Thread):
         self.history["handle"] = self.browser.current_window_handle
         self.history["handle"] = self.browser.current_window_handle
         self.scrollDown(node["parameters"])
         self.scrollDown(node["parameters"])
 
 
-    # 打开网页事件
+    # 打开网页操作
     def openPage(self, param, loopValue):
     def openPage(self, param, loopValue):
         time.sleep(1)  # 打开网页后强行等待至少1秒
         time.sleep(1)  # 打开网页后强行等待至少1秒
         if len(self.browser.window_handles) > 1:
         if len(self.browser.window_handles) > 1:
@@ -1457,7 +1528,7 @@ class BrowserThread(Thread):
             self.history["index"] = 0
             self.history["index"] = 0
         self.scrollDown(param)  # 控制屏幕向下滚动
         self.scrollDown(param)  # 控制屏幕向下滚动
 
 
-    # 键盘输入事件
+    # 键盘输入操作
     def inputInfo(self, param, loopValue):
     def inputInfo(self, param, loopValue):
         time.sleep(0.1)  # 输入之前等待0.1秒
         time.sleep(0.1)  # 输入之前等待0.1秒
         try:
         try:
@@ -1509,7 +1580,7 @@ class BrowserThread(Thread):
                                xpath + ", please try to set the wait time before executing this operation")
                                xpath + ", please try to set the wait time before executing this operation")
             self.print_and_log("找不到输入框元素:" + xpath + ",请尝试在执行此操作前设置等待时间")
             self.print_and_log("找不到输入框元素:" + xpath + ",请尝试在执行此操作前设置等待时间")
 
 
-    # 点击元素事件
+    # 点击元素操作
     def clickElement(self, param, loopElement=None, clickPath="", index=0):
     def clickElement(self, param, loopElement=None, clickPath="", index=0):
         try:
         try:
             maxWaitTime = int(param["maxWaitTime"])
             maxWaitTime = int(param["maxWaitTime"])
@@ -1525,7 +1596,10 @@ class BrowserThread(Thread):
                 clickPath, self.outputParameters, self)
                 clickPath, self.outputParameters, self)
             xpath = replace_field_values(
             xpath = replace_field_values(
                 param["xpath"], self.outputParameters, self)
                 param["xpath"], self.outputParameters, self)
-            if param["useLoop"]:  # 使用循环的情况下,传入的clickPath就是实际的xpath
+            if xpath.find("point(") >= 0:  # 如果xpath中包含point(),说明是相对坐标的点击
+                index = 0
+                path = "//body"
+            elif param["useLoop"]:  # 使用循环的情况下,传入的clickPath就是实际的xpath
                 if xpath == "":
                 if xpath == "":
                     path = clickPath
                     path = clickPath
                 else:
                 else:
@@ -1557,9 +1631,21 @@ class BrowserThread(Thread):
         try:
         try:
             newTab = int(param["newTab"])
             newTab = int(param["newTab"])
         except:
         except:
-            newTab = 1
+            newTab = 0
         try:
         try:
-            if click_way == 0:  # 用selenium的点击方法
+            if xpath.find("point(") >= 0:  # 如果xpath中包含point(),说明是相对坐标的点击
+                point = xpath.split("point(")[1].split(")")[0].split(",")
+                x = int(point[0])
+                y = int(point[1])
+                # try:
+                #     actions = ActionChains(self.browser)  # 实例化一个action对象
+                #     actions.move_to_element(element).perform()
+                #     actions.move_by_offset(x, y).perform()
+                #     actions.click().perform()
+                # except Exception as e:
+                script = "document.elementFromPoint(" + str(x) + "," + str(y) + ").click();"
+                self.browser.execute_script(script)
+            elif click_way == 0:  # 用selenium的点击方法
                 try:
                 try:
                     actions = ActionChains(self.browser)  # 实例化一个action对象
                     actions = ActionChains(self.browser)  # 实例化一个action对象
                     if newTab == 1:  # 在新标签页打开
                     if newTab == 1:  # 在新标签页打开
@@ -1693,7 +1779,11 @@ class BrowserThread(Thread):
                     download_image(self, content, "Data/Task_" +
                     download_image(self, content, "Data/Task_" +
                                    str(self.id) + "/" + self.saveName + "/", element)
                                    str(self.id) + "/" + self.saveName + "/", element)
             else:  # 普通节点
             else:  # 普通节点
-                content = element.text
+                if p["splitLine"] == 1:
+                    text = extract_text_from_html(element.get_attribute('outerHTML'))
+                    content = split_text_by_lines(text)
+                else:
+                    content = element.text
         elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
         elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
             if p["nodeType"] == 2:
             if p["nodeType"] == 2:
                 if element.get_attribute("href") != None:
                 if element.get_attribute("href") != None:
@@ -1830,7 +1920,7 @@ class BrowserThread(Thread):
             self.outputParameters[key] = ""
             self.outputParameters[key] = ""
         self.recordLog("清空输出参数|Clear output parameters")
         self.recordLog("清空输出参数|Clear output parameters")
 
 
-    # 提取数据事件
+    # 提取数据操作
     def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
     def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
         parentPath = replace_field_values(
         parentPath = replace_field_values(
             parentPath, self.outputParameters, self)
             parentPath, self.outputParameters, self)

+ 151 - 1
.temp_to_pub/EasySpider_windows_x64/Code/utils.py

@@ -7,8 +7,11 @@ import sys
 import re
 import re
 import time
 import time
 import uuid
 import uuid
+from bs4 import BeautifulSoup
 # import keyboard
 # import keyboard
 from openpyxl import Workbook, load_workbook
 from openpyxl import Workbook, load_workbook
+# import pandas as pd
+# import xlsxwriter
 import requests
 import requests
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 import pymysql
 import pymysql
@@ -69,6 +72,22 @@ def is_valid_url(url):
 def lowercase_tags_in_xpath(xpath):
 def lowercase_tags_in_xpath(xpath):
     return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
     return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
 
 
+# 提取HTML中的文本内容
+def extract_text_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'lxml') # 使用lxml作为解析器
+    for script in soup(["script", "style"]): # 去除脚本和样式内容
+        script.extract()
+    for p_tag in soup.find_all("p"):
+        p_tag.append(soup.new_tag("br")) # 在每个p标签后添加br标签
+        p_tag.append("\n") # 在每个p标签后添加换行符
+    text = soup.get_text()
+    return text
+
+# 将文本按照行分割并去除额外空白
+def split_text_by_lines(text):
+    lines = text.splitlines()
+    lines = [line.strip() for line in lines if line.strip()]  # 去除空行和首尾空格
+    return "\n".join(lines)
 
 
 def on_press_creator(press_time, event):
 def on_press_creator(press_time, event):
     def on_press(key):
     def on_press(key):
@@ -137,7 +156,11 @@ def on_release_creator(event, press_time):
 #         time.sleep(1)  # 每秒检查一次
 #         time.sleep(1)  # 每秒检查一次
 
 
 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
-    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
+    try:
+        splitLine = param["splitLine"]
+    except:
+        param["splitLine"] = 0
+    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
         if param["nodeType"] <= 2:
         if param["nodeType"] <= 2:
             if ignoreWaitElement or waitElement == "":
             if ignoreWaitElement or waitElement == "":
                 return True
                 return True
@@ -336,11 +359,115 @@ def write_to_json(file_name, data, types, record, keys):
 
 
 
 
 def write_to_excel(file_name, data, types, record):
 def write_to_excel(file_name, data, types, record):
+    # 首先,检查文件是否存在来决定是否处理第一行
+    # first = not os.path.exists(file_name)
+
+    # # 准备新数据
+    # new_data = pd.DataFrame(data)
+
+    # # 如果不是第一行(即文件已存在),对数据应用类型转换
+    # if not first:
+    #     for i, col_type in enumerate(types):
+    #         if col_type == "int" or col_type == "bigInt":
+    #             try:
+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce').astype(int)
+    #             except:
+    #                 new_data[i] = pd.to_numeric("0", errors='coerce').astype(int)
+    #         elif col_type == "double":
+    #             try:
+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce')(0.0)
+    #             except:
+    #                 new_data[i] = pd.to_numeric("0.0", errors='coerce').astype(float)
+    # # 根据 record 筛选列
+    # new_data = new_data.loc[:, record]
+
+    # # 如果文件存在,则读取现有数据并追加新数据
+    # if first:
+    #     combined_data = new_data
+    # else:
+    #     # 使用 Pandas 读取现有数据
+    #     existing_data = pd.read_excel(file_name)
+    #     # 合并现有数据与新数据
+    #     combined_data = pd.concat([existing_data, new_data], ignore_index=True)
+
+    # # 将合并后的数据写入 Excel
+    # combined_data.to_excel(file_name, index=False, engine='openpyxl')
+
+    # existing_data = []
+    # first = True
+    # # 检查文件是否存在
+    # if os.path.exists(file_name):
+    #     # 使用 openpyxl 读取现有数据
+    #     workbook = load_workbook(file_name, read_only=True)
+    #     sheet = workbook.active
+    #     # 读取已有行数
+    #     num_rows = sheet.max_row
+    #     if num_rows > 5000:
+    #         print("Excel文件中的数据行数超过5000行,过多的行数将会导致追加模式写入数据速度变慢,建议更换为CSV文件或MySQL数据库存储数据。正在读取数据,请稍等...")
+    #         print("The number of rows in the Excel file exceeds 5000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to replace it with CSV file or MySQL database to store data. Reading data, please wait...")
+    #     # existing_data = [[sheet.cell(row=i, column=j).value for j in range(1, sheet.max_column + 1)] for i in range(1, sheet.max_row + 1)]
+    #     for i in range(1, sheet.max_row + 1):
+    #         row_data = []
+    #         if num_rows > 5000 and i % 500 == 0:
+    #             print(f"正在读取第{i}/{num_rows}行的数据...")
+    #             print(f"Reading data of row {i}/{num_rows}...")
+    #         for j in range(1, sheet.max_column + 1):
+    #             cell = sheet.cell(row=i, column=j).value
+    #             if cell is None:
+    #                 cell = ""
+    #             row_data.append(cell)
+    #         existing_data.append(row_data)
+    #     first = False  # 如果文件存在,首行不再是标题行
+
+    # # 使用 xlsxwriter 创建新文件
+    # workbook = xlsxwriter.Workbook(file_name)
+    # worksheet = workbook.add_worksheet()
+
+    # # 写入现有数据
+    # for row_num, row_data in enumerate(existing_data):
+    #     for col_num, cell in enumerate(row_data):
+    #         worksheet.write(row_num, col_num, cell)
+
+    # # 写入新数据
+    # row = len(existing_data)
+    # for line in data:
+    #     to_write = []
+    #     for i in range(len(line)):
+    #         value = line[i]
+    #         if not first:  # 如果不是第一行,需要转换数据类型
+    #             if types[i] == "int" or types[i] == "bigInt":
+    #                 try:
+    #                     value = int(value)
+    #                 except ValueError:
+    #                     value = 0
+    #             elif types[i] == "double":
+    #                 try:
+    #                     value = float(value)
+    #                 except ValueError:
+    #                     value = 0.0
+    #         if record[i]:
+    #             to_write.append(value)
+    #     first = False  # 更新 first 以跳过数据类型转换
+    #     for col, item in enumerate(to_write):
+    #         worksheet.write(row, col, item)
+    #     row += 1
+
+    # # 关闭工作簿
+    # workbook.close()
+
     first = False
     first = False
     if os.path.exists(file_name):
     if os.path.exists(file_name):
         # 加载现有的工作簿
         # 加载现有的工作簿
         wb = load_workbook(file_name)
         wb = load_workbook(file_name)
+        # 行数读取
+        num_rows = wb.active.max_row
+        if num_rows > 1000:
+            print("Excel文件中的数据行数已超过1000行,过多的行数将会导致追加模式写入数据速度变慢,建议增大任务保存对话框中的“每采集多少条数据保存一次”选项的值以提升采集速度,或者更换为CSV文件或MySQL数据库存储数据。正在读取数据,请稍等...")
+            print("The number of rows in the Excel file already exceeds 1000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to increase the value of the 'Save every how many data' option in the task save dialog to improve the collection speed, or replace it with CSV file or MySQL database to store data. Reading data, please wait...")
         ws = wb.active
         ws = wb.active
+        if num_rows > 1000:
+            print("读取数据完成,正在追加数据...")
+            print("Reading data completed, appending data...")
     else:
     else:
         # 创建新的工作簿和工作表
         # 创建新的工作簿和工作表
         wb = Workbook()
         wb = Workbook()
@@ -433,6 +560,10 @@ class myMySQL:
         sql = "CREATE TABLE " + table_name + \
         sql = "CREATE TABLE " + table_name + \
             " (_id INT AUTO_INCREMENT PRIMARY KEY, "
             " (_id INT AUTO_INCREMENT PRIMARY KEY, "
         for item in parameters:
         for item in parameters:
+            try:
+                recordASField = item["recordASField"]
+            except:
+                item["recordASField"] = True
             if item["recordASField"]:
             if item["recordASField"]:
                 name = item['name']
                 name = item['name']
                 if item['type'] == 'int':
                 if item['type'] == 'int':
@@ -546,6 +677,25 @@ class myMySQL:
         # 关闭游标和连接
         # 关闭游标和连接
         self.cursor.close()
         self.cursor.close()
 
 
+    def remove_duplicate_data(self):
+        self.cursor = self.conn.cursor()
+        # 删除重复数据
+        fields = self.field_sql.replace("(", "").replace(")", "")
+        sql = f"CREATE TABLE {self.table_name}_temp AS " + \
+        f"SELECT MIN(_id) AS _id, " + fields + \
+        f" FROM {self.table_name} GROUP BY " + fields + ";"
+        self.cursor.execute(sql)
+        sql = f"DELETE FROM {self.table_name};"
+        self.cursor.execute(sql)
+        sql = f"INSERT INTO {self.table_name} SELECT * FROM {self.table_name}_temp;"
+        self.cursor.execute(sql)
+        sql = f"DROP TABLE {self.table_name}_temp;"
+        self.cursor.execute(sql)
+        # 提交到数据库执行
+        self.conn.commit()
+        # 关闭游标和连接
+        self.cursor.close()
+
     def close(self):
     def close(self):
         try:
         try:
             self.conn.close()
             self.conn.close()

File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/0.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/1.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/149.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/213.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/296.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/8.json


+ 1 - 0
.temp_to_pub/compress.py

@@ -31,6 +31,7 @@ def compress_folder_to_7z(folder_path, output_file):
     #     archive.writeall(folder_path, output_file)
     #     archive.writeall(folder_path, output_file)
     # 压缩文件夹
     # 压缩文件夹
     try:
     try:
+        # "-mmt4"表示使用4个线程压缩
         subprocess.call(["7z", "a", output_file, folder_path])
         subprocess.call(["7z", "a", output_file, folder_path])
     except:
     except:
         subprocess.call(["7za", "a", output_file, folder_path])
         subprocess.call(["7za", "a", output_file, folder_path])

BIN
ElectronJS/EasySpider_en.crx


BIN
ElectronJS/EasySpider_zh.crx


+ 1 - 0
ElectronJS/clean_and_release_win32.cmd

@@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x32\Code\.vscode /E /I /Y
 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x32\Code\.vscode /E /I /Y
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\user_data
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\user_data
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
+rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x32\TempUserDataFolder
 mkdir ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
 mkdir ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Data
 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Data
 mkdir ..\.temp_to_pub\EasySpider_windows_x32\Data
 mkdir ..\.temp_to_pub\EasySpider_windows_x32\Data

+ 1 - 0
ElectronJS/clean_and_release_win64.cmd

@@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x64\Code\.vscode /E /I /Y
 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x64\Code\.vscode /E /I /Y
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\user_data
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\user_data
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
+rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\TempUserDataFolder
 mkdir ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
 mkdir ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Data
 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Data
 mkdir ..\.temp_to_pub\EasySpider_windows_x64\Data
 mkdir ..\.temp_to_pub\EasySpider_windows_x64\Data

+ 7 - 1
ElectronJS/main.js

@@ -950,13 +950,19 @@ async function runBrowser(lang = "en", user_data_folder = '', mobile = false) {
     await cdpConnection.execute('Page.addScriptToEvaluateOnNewDocument', {
     await cdpConnection.execute('Page.addScriptToEvaluateOnNewDocument', {
         source: stealth,
         source: stealth,
     });
     });
+    if (config_context.user_data_folder == "") {
+        //调整浏览器窗口大小
+        let size = await driver.manage().window().getRect();
+        let width = size.width;
+        let height = size.height;
+        await driver.manage().window().setRect({width: width * 1.2, height: height});
+    }
     try {
     try {
         if (mobile) {
         if (mobile) {
             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&mobile=1&lang=" + lang);
             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&mobile=1&lang=" + lang);
         } else {
         } else {
             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&lang=" + lang);
             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&lang=" + lang);
         }
         }
-
         old_handles = await driver.getAllWindowHandles();
         old_handles = await driver.getAllWindowHandles();
         current_handle = old_handles[old_handles.length - 1];
         current_handle = old_handles[old_handles.length - 1];
     } finally {
     } finally {

+ 1 - 0
ElectronJS/package_linux64.sh

@@ -14,6 +14,7 @@ rm -rf out/EasySpider/resources/app/.idea
 rm -rf out/EasySpider/resources/app/tasks
 rm -rf out/EasySpider/resources/app/tasks
 rm -rf out/EasySpider/resources/app/execution_instances
 rm -rf out/EasySpider/resources/app/execution_instances
 rm -rf out/EasySpider/resources/app/user_data
 rm -rf out/EasySpider/resources/app/user_data
+rm -rf out/EasySpider/resources/app/TempUserDataFolder
 rm -rf ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
 rm -rf ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
 rm out/EasySpider/resources/app/vs_BuildTools.exe
 rm out/EasySpider/resources/app/vs_BuildTools.exe
 mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
 mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider

+ 1 - 0
ElectronJS/package_macos.sh

@@ -17,6 +17,7 @@ rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resource
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/tasks
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/tasks
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/execution_instances
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/execution_instances
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/user_data
 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/user_data
+rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/TempUserDataFolder
 rm -rf ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
 rm -rf ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
 mkdir ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
 mkdir ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
 cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
 cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code

+ 1 - 1
ElectronJS/src/taskGrid/FlowChart.html

@@ -651,7 +651,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
                     </div>
                     </div>
                     <div v-else-if='TClass == 7'>
                     <div v-else-if='TClass == 7'>
                         <label>Code/Script Content (<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">Click here</a> for more examples): </label>
                         <label>Code/Script Content (<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">Click here</a> for more examples): </label>
-                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length >= 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
+                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length > 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
                         <label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
                         <label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                     </div>
                     </div>

+ 1 - 1
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -651,7 +651,7 @@ print(emotlib.emoji()) # 使用其中的函数。
                     </div>
                     </div>
                     <div v-else-if='TClass == 7'>
                     <div v-else-if='TClass == 7'>
                         <label>代码/脚本内容(<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例): </label>
                         <label>代码/脚本内容(<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例): </label>
-                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.length >=5 即判断当前循环项的文本长度是否大于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
+                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.length >5 即判断当前循环项的文本长度是否大于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
                         <label>最长等待脚本执行时间(0代表无限等待): </label>
                         <label>最长等待脚本执行时间(0代表无限等待): </label>
                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                     </div>
                     </div>

+ 1 - 1
ElectronJS/src/taskGrid/executeTask.html

@@ -10,7 +10,7 @@
     <script src="vue.js"></script>
     <script src="vue.js"></script>
     <script src="bootstrap/js/bootstrap.js"></script>
     <script src="bootstrap/js/bootstrap.js"></script>
     <link href="bootstrap/css/bootstrap.css" rel="stylesheet"></link>
     <link href="bootstrap/css/bootstrap.css" rel="stylesheet"></link>
-    <title>任务执行 | Task Execute</title>
+    <title>任务执行 | Task Execution</title>
     <style>
     <style>
         table {
         table {
             table-layout: auto;
             table-layout: auto;

+ 1 - 1
ExecuteStage/.vscode/launch.json

@@ -12,7 +12,7 @@
             "justMyCode": false,
             "justMyCode": false,
             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--ids", "[40]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
+            "args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
         "--read_type", "remote"]
         "--read_type", "remote"]
             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
         }
         }

+ 2 - 2
ExecuteStage/easyspider_executestage.py

@@ -513,8 +513,8 @@ class BrowserThread(Thread):
             self.urlId = self.urlId + 1
             self.urlId = self.urlId + 1
         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         # 如果目录为空,则删除该目录
         # 如果目录为空,则删除该目录
-        if not files:
-            os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
+        # if not files:
+        #     os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
         self.print_and_log("Done!")
         self.print_and_log("Done!")
         self.print_and_log("执行完成!")
         self.print_and_log("执行完成!")
         self.saveData(exit=True)
         self.saveData(exit=True)

+ 4 - 0
ExecuteStage/utils.py

@@ -156,6 +156,10 @@ def on_release_creator(event, press_time):
 #         time.sleep(1)  # 每秒检查一次
 #         time.sleep(1)  # 每秒检查一次
 
 
 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
+    try:
+        splitLine = param["splitLine"]
+    except:
+        param["splitLine"] = 0
     if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
     if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
         if param["nodeType"] <= 2:
         if param["nodeType"] <= 2:
             if ignoreWaitElement or waitElement == "":
             if ignoreWaitElement or waitElement == "":

Some files were not shown because too many files changed in this diff