1 year ago · ed0768ca51
--- a/.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py
+++ b/.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py
@@ -6,8 +6,8 @@ import platform
 
															 import shutil
														
 
															 import string
														
 
															 import undetected_chromedriver as uc
														
 
															-from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
														
 
															-    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
														
 
															+from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
														
 
															+    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
														
 
															 from myChrome import MyChrome
														
 
															 from threading import Thread, Event
														
 
															 from PIL import Image
														
@@ -47,10 +47,11 @@ import requests
 
															 from ddddocr import DdddOcr
														
 
															 from urllib.parse import urljoin
														
 
															 from lxml import etree, html
														
 
															+
														
 
															 import onnxruntime
														
 
															 onnxruntime.set_default_logger_severity(3)  # 隐藏onnxruntime的日志
														
 
															-# import pandas as pd
														
 
															+import pandas as pd
														
 
															 # import numpy
														
 
															 # import pytesseract
														
 
															 # import uuid
														
@@ -295,9 +296,13 @@ class BrowserThread(Thread):
 
															                     except:
														
 
															                         pass
														
 
															                     try:
														
 
															-                        node["parameters"]["recordASField"] += param["recordASField"]
														
 
															+                        node["parameters"]["recordASField"] = param["recordASField"]
														
 
															+                    except:
														
 
															+                        node["parameters"]["recordASField"] = 1
														
 
															+                    try:
														
 
															+                        splitLine = int(param["splitLine"])
														
 
															                     except:
														
 
															-                        node["parameters"]["recordASField"] += 1
														
 
															+                        param["splitLine"] = 0
														
 
															                     if param["contentType"] == 8:
														
 
															                         self.print_and_log(
														
 
															                             "默认的ddddocr识别功能如果觉得不好用，可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行；或者可以先设置采集内容类型为“元素截图”把图片保存下来，然后用自定义操作调用自己写的程序，程序的功能是读取这个最新生成的图片，然后用好用的模型，如PaddleOCR把图片识别出来，然后把返回值返回给程序作为参数输出。")
														
@@ -333,6 +338,10 @@ class BrowserThread(Thread):
 
															                 except:
														
 
															                     node["parameters"]["exitElement"] = "//body"
														
 
															                 node["parameters"]["quickExtractable"] = False # 是否可以快速提取
														
 
															+                try:
														
 
															+                    skipCount = node["parameters"]["skipCount"]
														
 
															+                except:
														
 
															+                    node["parameters"]["skipCount"] = 0
														
 
															                 # 如果（不）固定元素列表循环中只有一个提取数据操作，且提取数据操作的提取内容为元素截图，那么可以快速提取
														
 
															                 if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
														
 
															                     try:
														
@@ -347,6 +356,8 @@ class BrowserThread(Thread):
 
															                         node["parameters"]["quickExtractable"] = False # 如果是iframe，那么不可以快速提取
														
 
															                     else:
														
 
															                         node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
														
 
															+                    if node["parameters"]["skipCount"] > 0:
														
 
															+                        node["parameters"]["quickExtractable"] = False # 如果有跳过的元素，那么不可以快速提取
														
 
															                     for param in params:
														
 
															                         optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
														
 
															                         try:
														
@@ -463,21 +474,51 @@ class BrowserThread(Thread):
 
															         self.print_and_log(
														
 
															             "Already read input parameters from Excel and overwrite the original input parameters.")
														
 
															+    def removeDuplicateData(self):
														
 
															+        try:
														
 
															+            removeDuplicateData = self.service["removeDuplicate"]
														
 
															+        except:
														
 
															+            removeDuplicateData = 0
														
 
															+        if removeDuplicateData == 1:
														
 
															+            self.print_and_log("正在去除重复数据，请稍后……")
														
 
															+            self.print_and_log("Removing duplicate data, please wait...")
														
 
															+            if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "json" or self.outputFormat == "xlsx":
														
 
															+                file_name = "Data/Task_" + \
														
 
															+                            str(self.id) + "/" + self.saveName + \
														
 
															+                            '.' + self.outputFormat
														
 
															+                if self.outputFormat == "csv" or self.outputFormat == "txt":
														
 
															+                    df = pd.read_csv(file_name)
														
 
															+                    df.drop_duplicates(inplace=True)
														
 
															+                    df.to_csv(file_name, index=False)
														
 
															+                elif self.outputFormat == "xlsx":
														
 
															+                    df = pd.read_excel(file_name)
														
 
															+                    df.drop_duplicates(inplace=True)
														
 
															+                    df.to_excel(file_name, index=False)
														
 
															+                elif self.outputFormat == "json":
														
 
															+                    df = pd.read_json(file_name)
														
 
															+                    df.drop_duplicates(inplace=True)
														
 
															+                    df.to_json(file_name, orient="records", force_ascii=False)
														
 
															+            elif self.outputFormat == "mysql":
														
 
															+                self.mysql.remove_duplicate_data()
														
 
															+            self.print_and_log("去重完成。")
														
 
															+            self.print_and_log("Duplicate data removed.")
														
 
															+
														
 
															     def run(self):
														
 
															         # 挨个执行程序
														
 
															         for i in range(len(self.links)):
														
 
															-            self.print_and_log("正在执行第", i + 1, "/ ", len(self.links), "个链接")
														
 
															+            self.print_and_log("正在执行第", i + 1, "/", len(self.links), "个链接")
														
 
															             self.print_and_log("Executing link", i + 1,
														
 
															-                               "/ ", len(self.links))
														
 
															+                               "/", len(self.links))
														
 
															             self.executeNode(0)
														
 
															             self.urlId = self.urlId + 1
														
 
															         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															         # 如果目录为空，则删除该目录
														
 
															-        if not files:
														
 
															-            os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															+        # if not files:
														
 
															+        #     os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															         self.print_and_log("Done!")
														
 
															         self.print_and_log("执行完成！")
														
 
															         self.saveData(exit=True)
														
 
															+        self.removeDuplicateData()
														
 
															         if self.outputFormat == "mysql":
														
 
															             self.mysql.close()
														
 
															         try:
														
@@ -1115,10 +1156,18 @@ class BrowserThread(Thread):
 
															                     if node["parameters"]["exitCount"] == 0:
														
 
															                         # newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
														
 
															                         # 用find_elements获取所有匹配到的文本
														
 
															-                        exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
														
 
															-                        newBodyText = ""
														
 
															-                        for exitElement in exitElements:
														
 
															-                            newBodyText += exitElement.text
														
 
															+                        try:
														
 
															+                            exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
														
 
															+                            newBodyText = ""
														
 
															+                            for exitElement in exitElements:
														
 
															+                                newBodyText += exitElement.text
														
 
															+                        except Exception as e:
														
 
															+                            self.print_and_log(f"设定的退出循环元素：{node['parameters']['exitElement']}的文本无法获取，本次循环将不再检测元素文本是否变化，将会继续执行，为解决此问题，您可以修改检测元素文本不变的元素为其他元素，或者将循环次数设定为固定次数大于0的值。")
														
 
															+                            self.print_and_log(f"The text of the exit loop element set: {node['parameters']['exitElement']} cannot be obtained, this loop will no longer check whether the text of the element has changed, and will continue to execute. To solve this problem, you can modify the element whose text does not change to other elements, or set the number of loops to a fixed number greater than 0.")
														
 
															+                            self.print_and_log(e)
														
 
															+                            exitElements = []
														
 
															+                            # newBodyText为随机文本，保证一直执行
														
 
															+                            newBodyText = str(random.random())
														
 
															                         if node["parameters"]["iframe"]:  # 如果标记了iframe
														
 
															                             iframes = self.browser.find_elements(
														
 
															                                 By.CSS_SELECTOR, "iframe", iframe=False)
														
@@ -1200,9 +1249,15 @@ class BrowserThread(Thread):
 
															                 if len(elements) == 0:
														
 
															                     self.print_and_log("Loop element not found: ",
														
 
															                                        xpath)
														
 
															-                    self.print_and_log("找不到循环元素: ", xpath)
														
 
															+                    self.print_and_log("找不到循环元素：", xpath)
														
 
															                 index = 0
														
 
															+                skipCount = node["parameters"]["skipCount"]
														
 
															                 while index < len(elements):
														
 
															+                    if index < skipCount:
														
 
															+                        index += 1
														
 
															+                        self.print_and_log("跳过第" + str(index) + "个元素")
														
 
															+                        self.print_and_log("Skip the " + str(index) + "th element")
														
 
															+                        continue
														
 
															                     try:
														
 
															                         element = elements[index]
														
 
															                         element_text = element.text
														
@@ -1250,7 +1305,7 @@ class BrowserThread(Thread):
 
															                     index = index + 1
														
 
															             except NoSuchElementException:
														
 
															                 self.print_and_log("Loop element not found: ", xpath)
														
 
															-                self.print_and_log("找不到循环元素: ", xpath)
														
 
															+                self.print_and_log("找不到循环元素：", xpath)
														
 
															             except Exception as e:
														
 
															                 raise
														
 
															         elif int(node["parameters"]["loopType"]) == 2:  # 固定元素列表
														
@@ -1258,7 +1313,13 @@ class BrowserThread(Thread):
 
															             paths = node["parameters"]["pathList"].split("\n")
														
 
															             # for path in node["parameters"]["pathList"].split("\n"):
														
 
															             index = 0
														
 
															+            skipCount = node["parameters"]["skipCount"]
														
 
															             while index < len(paths):
														
 
															+                if index < skipCount:
														
 
															+                    index += 1
														
 
															+                    self.print_and_log("跳过第" + str(index) + "个元素")
														
 
															+                    self.print_and_log("Skip the " + str(index) + "th element")
														
 
															+                    continue
														
 
															                 path = paths[index]
														
 
															                 try:
														
 
															                     path = replace_field_values(
														
@@ -1295,7 +1356,7 @@ class BrowserThread(Thread):
 
															                     index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
														
 
															                 except NoSuchElementException:
														
 
															                     self.print_and_log("Loop element not found: ", path)
														
 
															-                    self.print_and_log("找不到循环元素: ", path)
														
 
															+                    self.print_and_log("找不到循环元素：", path)
														
 
															                     index += 1
														
 
															                     continue  # 循环中找不到元素就略过操作
														
 
															                 except Exception as e:
														
@@ -1314,7 +1375,14 @@ class BrowserThread(Thread):
 
															             if len(textList) == 1:  # 如果固定文本列表只有一行，现在就可以替换变量
														
 
															                 textList = replace_field_values(
														
 
															                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
														
 
															+            skipCount = node["parameters"]["skipCount"]
														
 
															+            index = 0
														
 
															             for text in textList:
														
 
															+                if index < skipCount:
														
 
															+                    index += 1
														
 
															+                    self.print_and_log("跳过第" + str(index) + "个文本")
														
 
															+                    self.print_and_log("Skip the " + str(index) + "th text")
														
 
															+                    continue
														
 
															                 text = replace_field_values(text, self.outputParameters, self)
														
 
															                 # self.recordLog("当前循环文本|Current loop text:", text)
														
 
															                 for i in node["sequence"]:  # 挨个执行操作
														
@@ -1340,11 +1408,14 @@ class BrowserThread(Thread):
 
															             if len(urlList) == 1:  # 如果固定网址列表只有一行，现在就可以替换变量
														
 
															                 urlList = replace_field_values(
														
 
															                     node["parameters"]["textList"], self.outputParameters, self).split("\n")
														
 
															-            # urlList = []
														
 
															-            # for url in tempList:
														
 
															-            #     if url != "":
														
 
															-            #         urlList.append(url)
														
 
															+            skipCount = node["parameters"]["skipCount"]
														
 
															+            index = 0
														
 
															             for url in urlList:
														
 
															+                if index < skipCount:
														
 
															+                    index += 1
														
 
															+                    self.print_and_log("跳过第" + str(index) + "个网址")
														
 
															+                    self.print_and_log("Skip the " + str(index) + "th url")
														
 
															+                    continue
														
 
															                 url = replace_field_values(url, self.outputParameters, self)
														
 
															                 # self.recordLog("当前循环网址|Current loop url:", url)
														
 
															                 for i in node["sequence"]:
														
@@ -1392,7 +1463,7 @@ class BrowserThread(Thread):
 
															         self.history["handle"] = self.browser.current_window_handle
														
 
															         self.scrollDown(node["parameters"])
														
 
															-    # 打开网页事件
														
 
															+    # 打开网页操作
														
 
															     def openPage(self, param, loopValue):
														
 
															         time.sleep(1)  # 打开网页后强行等待至少1秒
														
 
															         if len(self.browser.window_handles) > 1:
														
@@ -1457,7 +1528,7 @@ class BrowserThread(Thread):
 
															             self.history["index"] = 0
														
 
															         self.scrollDown(param)  # 控制屏幕向下滚动
														
 
															-    # 键盘输入事件
														
 
															+    # 键盘输入操作
														
 
															     def inputInfo(self, param, loopValue):
														
 
															         time.sleep(0.1)  # 输入之前等待0.1秒
														
 
															         try:
														
@@ -1509,7 +1580,7 @@ class BrowserThread(Thread):
 
															                                xpath + ", please try to set the wait time before executing this operation")
														
 
															             self.print_and_log("找不到输入框元素:" + xpath + "，请尝试在执行此操作前设置等待时间")
														
 
															-    # 点击元素事件
														
 
															+    # 点击元素操作
														
 
															     def clickElement(self, param, loopElement=None, clickPath="", index=0):
														
 
															         try:
														
 
															             maxWaitTime = int(param["maxWaitTime"])
														
@@ -1525,7 +1596,10 @@ class BrowserThread(Thread):
 
															                 clickPath, self.outputParameters, self)
														
 
															             xpath = replace_field_values(
														
 
															                 param["xpath"], self.outputParameters, self)
														
 
															-            if param["useLoop"]:  # 使用循环的情况下，传入的clickPath就是实际的xpath
														
 
															+            if xpath.find("point(") >= 0:  # 如果xpath中包含point()，说明是相对坐标的点击
														
 
															+                index = 0
														
 
															+                path = "//body"
														
 
															+            elif param["useLoop"]:  # 使用循环的情况下，传入的clickPath就是实际的xpath
														
 
															                 if xpath == "":
														
 
															                     path = clickPath
														
 
															                 else:
														
@@ -1557,9 +1631,21 @@ class BrowserThread(Thread):
 
															         try:
														
 
															             newTab = int(param["newTab"])
														
 
															         except:
														
 
															-            newTab = 1
														
 
															+            newTab = 0
														
 
															         try:
														
 
															-            if click_way == 0:  # 用selenium的点击方法
														
 
															+            if xpath.find("point(") >= 0:  # 如果xpath中包含point()，说明是相对坐标的点击
														
 
															+                point = xpath.split("point(")[1].split(")")[0].split(",")
														
 
															+                x = int(point[0])
														
 
															+                y = int(point[1])
														
 
															+                # try:
														
 
															+                #     actions = ActionChains(self.browser)  # 实例化一个action对象
														
 
															+                #     actions.move_to_element(element).perform()
														
 
															+                #     actions.move_by_offset(x, y).perform()
														
 
															+                #     actions.click().perform()
														
 
															+                # except Exception as e:
														
 
															+                script = "document.elementFromPoint(" + str(x) + "," + str(y) + ").click();"
														
 
															+                self.browser.execute_script(script)
														
 
															+            elif click_way == 0:  # 用selenium的点击方法
														
 
															                 try:
														
 
															                     actions = ActionChains(self.browser)  # 实例化一个action对象
														
 
															                     if newTab == 1:  # 在新标签页打开
														
@@ -1693,7 +1779,11 @@ class BrowserThread(Thread):
 
															                     download_image(self, content, "Data/Task_" +
														
 
															                                    str(self.id) + "/" + self.saveName + "/", element)
														
 
															             else:  # 普通节点
														
 
															-                content = element.text
														
 
															+                if p["splitLine"] == 1:
														
 
															+                    text = extract_text_from_html(element.get_attribute('outerHTML'))
														
 
															+                    content = split_text_by_lines(text)
														
 
															+                else:
														
 
															+                    content = element.text
														
 
															         elif p["contentType"] == 1:  # 只采集当期元素下的文本，不包括子元素
														
 
															             if p["nodeType"] == 2:
														
 
															                 if element.get_attribute("href") != None:
														
@@ -1830,7 +1920,7 @@ class BrowserThread(Thread):
 
															             self.outputParameters[key] = ""
														
 
															         self.recordLog("清空输出参数|Clear output parameters")
														
 
															-    # 提取数据事件
														
 
															+    # 提取数据操作
														
 
															     def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
														
 
															         parentPath = replace_field_values(
														
 
															             parentPath, self.outputParameters, self)
														
--- a/.temp_to_pub/EasySpider_windows_x64/Code/utils.py
+++ b/.temp_to_pub/EasySpider_windows_x64/Code/utils.py
@@ -7,8 +7,11 @@ import sys
 
															 import re
														
 
															 import time
														
 
															 import uuid
														
 
															+from bs4 import BeautifulSoup
														
 
															 # import keyboard
														
 
															 from openpyxl import Workbook, load_workbook
														
 
															+# import pandas as pd
														
 
															+# import xlsxwriter
														
 
															 import requests
														
 
															 from urllib.parse import urlparse
														
 
															 import pymysql
														
@@ -69,6 +72,22 @@ def is_valid_url(url):
 
															 def lowercase_tags_in_xpath(xpath):
														
 
															     return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
														
 
															+# 提取HTML中的文本内容
														
 
															+def extract_text_from_html(html_content):
														
 
															+    soup = BeautifulSoup(html_content, 'lxml') # 使用lxml作为解析器
														
 
															+    for script in soup(["script", "style"]): # 去除脚本和样式内容
														
 
															+        script.extract()
														
 
															+    for p_tag in soup.find_all("p"):
														
 
															+        p_tag.append(soup.new_tag("br")) # 在每个p标签后添加br标签
														
 
															+        p_tag.append("\n") # 在每个p标签后添加换行符
														
 
															+    text = soup.get_text()
														
 
															+    return text
														
 
															+
														
 
															+# 将文本按照行分割并去除额外空白
														
 
															+def split_text_by_lines(text):
														
 
															+    lines = text.splitlines()
														
 
															+    lines = [line.strip() for line in lines if line.strip()]  # 去除空行和首尾空格
														
 
															+    return "\n".join(lines)
														
 
															 def on_press_creator(press_time, event):
														
 
															     def on_press(key):
														
@@ -137,7 +156,11 @@ def on_release_creator(event, press_time):
 
															 #         time.sleep(1)  # 每秒检查一次
														
 
															 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
														
 
															-    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
														
 
															+    try:
														
 
															+        splitLine = param["splitLine"]
														
 
															+    except:
														
 
															+        param["splitLine"] = 0
														
 
															+    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
														
 
															         if param["nodeType"] <= 2:
														
 
															             if ignoreWaitElement or waitElement == "":
														
 
															                 return True
														
@@ -336,11 +359,115 @@ def write_to_json(file_name, data, types, record, keys):
 
															 def write_to_excel(file_name, data, types, record):
														
 
															+    # 首先，检查文件是否存在来决定是否处理第一行
														
 
															+    # first = not os.path.exists(file_name)
														
 
															+
														
 
															+    # # 准备新数据
														
 
															+    # new_data = pd.DataFrame(data)
														
 
															+
														
 
															+    # # 如果不是第一行（即文件已存在），对数据应用类型转换
														
 
															+    # if not first:
														
 
															+    #     for i, col_type in enumerate(types):
														
 
															+    #         if col_type == "int" or col_type == "bigInt":
														
 
															+    #             try:
														
 
															+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce').astype(int)
														
 
															+    #             except:
														
 
															+    #                 new_data[i] = pd.to_numeric("0", errors='coerce').astype(int)
														
 
															+    #         elif col_type == "double":
														
 
															+    #             try:
														
 
															+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce')(0.0)
														
 
															+    #             except:
														
 
															+    #                 new_data[i] = pd.to_numeric("0.0", errors='coerce').astype(float)
														
 
															+    # # 根据 record 筛选列
														
 
															+    # new_data = new_data.loc[:, record]
														
 
															+
														
 
															+    # # 如果文件存在，则读取现有数据并追加新数据
														
 
															+    # if first:
														
 
															+    #     combined_data = new_data
														
 
															+    # else:
														
 
															+    #     # 使用 Pandas 读取现有数据
														
 
															+    #     existing_data = pd.read_excel(file_name)
														
 
															+    #     # 合并现有数据与新数据
														
 
															+    #     combined_data = pd.concat([existing_data, new_data], ignore_index=True)
														
 
															+
														
 
															+    # # 将合并后的数据写入 Excel
														
 
															+    # combined_data.to_excel(file_name, index=False, engine='openpyxl')
														
 
															+
														
 
															+    # existing_data = []
														
 
															+    # first = True
														
 
															+    # # 检查文件是否存在
														
 
															+    # if os.path.exists(file_name):
														
 
															+    #     # 使用 openpyxl 读取现有数据
														
 
															+    #     workbook = load_workbook(file_name, read_only=True)
														
 
															+    #     sheet = workbook.active
														
 
															+    #     # 读取已有行数
														
 
															+    #     num_rows = sheet.max_row
														
 
															+    #     if num_rows > 5000:
														
 
															+    #         print("Excel文件中的数据行数超过5000行，过多的行数将会导致追加模式写入数据速度变慢，建议更换为CSV文件或MySQL数据库存储数据。正在读取数据，请稍等...")
														
 
															+    #         print("The number of rows in the Excel file exceeds 5000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to replace it with CSV file or MySQL database to store data. Reading data, please wait...")
														
 
															+    #     # existing_data = [[sheet.cell(row=i, column=j).value for j in range(1, sheet.max_column + 1)] for i in range(1, sheet.max_row + 1)]
														
 
															+    #     for i in range(1, sheet.max_row + 1):
														
 
															+    #         row_data = []
														
 
															+    #         if num_rows > 5000 and i % 500 == 0:
														
 
															+    #             print(f"正在读取第{i}/{num_rows}行的数据...")
														
 
															+    #             print(f"Reading data of row {i}/{num_rows}...")
														
 
															+    #         for j in range(1, sheet.max_column + 1):
														
 
															+    #             cell = sheet.cell(row=i, column=j).value
														
 
															+    #             if cell is None:
														
 
															+    #                 cell = ""
														
 
															+    #             row_data.append(cell)
														
 
															+    #         existing_data.append(row_data)
														
 
															+    #     first = False  # 如果文件存在，首行不再是标题行
														
 
															+
														
 
															+    # # 使用 xlsxwriter 创建新文件
														
 
															+    # workbook = xlsxwriter.Workbook(file_name)
														
 
															+    # worksheet = workbook.add_worksheet()
														
 
															+
														
 
															+    # # 写入现有数据
														
 
															+    # for row_num, row_data in enumerate(existing_data):
														
 
															+    #     for col_num, cell in enumerate(row_data):
														
 
															+    #         worksheet.write(row_num, col_num, cell)
														
 
															+
														
 
															+    # # 写入新数据
														
 
															+    # row = len(existing_data)
														
 
															+    # for line in data:
														
 
															+    #     to_write = []
														
 
															+    #     for i in range(len(line)):
														
 
															+    #         value = line[i]
														
 
															+    #         if not first:  # 如果不是第一行，需要转换数据类型
														
 
															+    #             if types[i] == "int" or types[i] == "bigInt":
														
 
															+    #                 try:
														
 
															+    #                     value = int(value)
														
 
															+    #                 except ValueError:
														
 
															+    #                     value = 0
														
 
															+    #             elif types[i] == "double":
														
 
															+    #                 try:
														
 
															+    #                     value = float(value)
														
 
															+    #                 except ValueError:
														
 
															+    #                     value = 0.0
														
 
															+    #         if record[i]:
														
 
															+    #             to_write.append(value)
														
 
															+    #     first = False  # 更新 first 以跳过数据类型转换
														
 
															+    #     for col, item in enumerate(to_write):
														
 
															+    #         worksheet.write(row, col, item)
														
 
															+    #     row += 1
														
 
															+
														
 
															+    # # 关闭工作簿
														
 
															+    # workbook.close()
														
 
															+
														
 
															     first = False
														
 
															     if os.path.exists(file_name):
														
 
															         # 加载现有的工作簿
														
 
															         wb = load_workbook(file_name)
														
 
															+        # 行数读取
														
 
															+        num_rows = wb.active.max_row
														
 
															+        if num_rows > 1000:
														
 
															+            print("Excel文件中的数据行数已超过1000行，过多的行数将会导致追加模式写入数据速度变慢，建议增大任务保存对话框中的“每采集多少条数据保存一次”选项的值以提升采集速度，或者更换为CSV文件或MySQL数据库存储数据。正在读取数据，请稍等...")
														
 
															+            print("The number of rows in the Excel file already exceeds 1000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to increase the value of the 'Save every how many data' option in the task save dialog to improve the collection speed, or replace it with CSV file or MySQL database to store data. Reading data, please wait...")
														
 
															         ws = wb.active
														
 
															+        if num_rows > 1000:
														
 
															+            print("读取数据完成，正在追加数据...")
														
 
															+            print("Reading data completed, appending data...")
														
 
															     else:
														
 
															         # 创建新的工作簿和工作表
														
 
															         wb = Workbook()
														
@@ -433,6 +560,10 @@ class myMySQL:
 
															         sql = "CREATE TABLE " + table_name + \
														
 
															             " (_id INT AUTO_INCREMENT PRIMARY KEY, "
														
 
															         for item in parameters:
														
 
															+            try:
														
 
															+                recordASField = item["recordASField"]
														
 
															+            except:
														
 
															+                item["recordASField"] = True
														
 
															             if item["recordASField"]:
														
 
															                 name = item['name']
														
 
															                 if item['type'] == 'int':
														
@@ -546,6 +677,25 @@ class myMySQL:
 
															         # 关闭游标和连接
														
 
															         self.cursor.close()
														
 
															+    def remove_duplicate_data(self):
														
 
															+        self.cursor = self.conn.cursor()
														
 
															+        # 删除重复数据
														
 
															+        fields = self.field_sql.replace("(", "").replace(")", "")
														
 
															+        sql = f"CREATE TABLE {self.table_name}_temp AS " + \
														
 
															+        f"SELECT MIN(_id) AS _id, " + fields + \
														
 
															+        f" FROM {self.table_name} GROUP BY " + fields + ";"
														
 
															+        self.cursor.execute(sql)
														
 
															+        sql = f"DELETE FROM {self.table_name};"
														
 
															+        self.cursor.execute(sql)
														
 
															+        sql = f"INSERT INTO {self.table_name} SELECT * FROM {self.table_name}_temp;"
														
 
															+        self.cursor.execute(sql)
														
 
															+        sql = f"DROP TABLE {self.table_name}_temp;"
														
 
															+        self.cursor.execute(sql)
														
 
															+        # 提交到数据库执行
														
 
															+        self.conn.commit()
														
 
															+        # 关闭游标和连接
														
 
															+        self.cursor.close()
														
 
															+
														
 
															     def close(self):
														
 
															         try:
														
 
															             self.conn.close()
														
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/0.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/0.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/1.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/1.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/149.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/149.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/213.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/213.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/296.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/296.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/8.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/8.json
--- a/.temp_to_pub/compress.py
+++ b/.temp_to_pub/compress.py
@@ -31,6 +31,7 @@ def compress_folder_to_7z(folder_path, output_file):
 
															     #     archive.writeall(folder_path, output_file)
														
 
															     # 压缩文件夹
														
 
															     try:
														
 
															+        # "-mmt4"表示使用4个线程压缩
														
 
															         subprocess.call(["7z", "a", output_file, folder_path])
														
 
															     except:
														
 
															         subprocess.call(["7za", "a", output_file, folder_path])
														
--- a/ElectronJS/EasySpider_en.crx
+++ b/ElectronJS/EasySpider_en.crx
--- a/ElectronJS/EasySpider_zh.crx
+++ b/ElectronJS/EasySpider_zh.crx
--- a/ElectronJS/clean_and_release_win32.cmd
+++ b/ElectronJS/clean_and_release_win32.cmd
@@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
 
															 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x32\Code\.vscode /E /I /Y
														
 
															 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\user_data
														
 
															 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
														
 
															+rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x32\TempUserDataFolder
														
 
															 mkdir ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
														
 
															 rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Data
														
 
															 mkdir ..\.temp_to_pub\EasySpider_windows_x32\Data
														
--- a/ElectronJS/clean_and_release_win64.cmd
+++ b/ElectronJS/clean_and_release_win64.cmd
@@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
 
															 xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x64\Code\.vscode /E /I /Y
														
 
															 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\user_data
														
 
															 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
														
 
															+rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\TempUserDataFolder
														
 
															 mkdir ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
														
 
															 rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Data
														
 
															 mkdir ..\.temp_to_pub\EasySpider_windows_x64\Data
														
--- a/ElectronJS/main.js
+++ b/ElectronJS/main.js
@@ -950,13 +950,19 @@ async function runBrowser(lang = "en", user_data_folder = '', mobile = false) {
 
															     await cdpConnection.execute('Page.addScriptToEvaluateOnNewDocument', {
														
 
															         source: stealth,
														
 
															     });
														
 
															+    if (config_context.user_data_folder == "") {
														
 
															+        //调整浏览器窗口大小
														
 
															+        let size = await driver.manage().window().getRect();
														
 
															+        let width = size.width;
														
 
															+        let height = size.height;
														
 
															+        await driver.manage().window().setRect({width: width * 1.2, height: height});
														
 
															+    }
														
 
															     try {
														
 
															         if (mobile) {
														
 
															             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&mobile=1&lang=" + lang);
														
 
															         } else {
														
 
															             await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&lang=" + lang);
														
 
															         }
														
 
															-
														
 
															         old_handles = await driver.getAllWindowHandles();
														
 
															         current_handle = old_handles[old_handles.length - 1];
														
 
															     } finally {
														
--- a/ElectronJS/package_linux64.sh
+++ b/ElectronJS/package_linux64.sh
@@ -14,6 +14,7 @@ rm -rf out/EasySpider/resources/app/.idea
 
															 rm -rf out/EasySpider/resources/app/tasks
														
 
															 rm -rf out/EasySpider/resources/app/execution_instances
														
 
															 rm -rf out/EasySpider/resources/app/user_data
														
 
															+rm -rf out/EasySpider/resources/app/TempUserDataFolder
														
 
															 rm -rf ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
														
 
															 rm out/EasySpider/resources/app/vs_BuildTools.exe
														
 
															 mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
														
--- a/ElectronJS/package_macos.sh
+++ b/ElectronJS/package_macos.sh
@@ -17,6 +17,7 @@ rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resource
 
															 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/tasks
														
 
															 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/execution_instances
														
 
															 rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/user_data
														
 
															+rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/TempUserDataFolder
														
 
															 rm -rf ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
														
 
															 mkdir ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
														
 
															 cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
														
--- a/ElectronJS/src/taskGrid/FlowChart.html
+++ b/ElectronJS/src/taskGrid/FlowChart.html
@@ -651,7 +651,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
 
															                     </div>
														
 
															                     <div v-else-if='TClass == 7'>
														
 
															                         <label>Code/Script Content (<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">Click here</a> for more examples): </label>
														
 
															-                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length >= 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
														
 
															+                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length > 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
														
 
															                         <label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
														
 
															                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
														
 
															                     </div>
														
--- a/ElectronJS/src/taskGrid/FlowChart_CN.html
+++ b/ElectronJS/src/taskGrid/FlowChart_CN.html
@@ -651,7 +651,7 @@ print(emotlib.emoji()) # 使用其中的函数。
 
															                     </div>
														
 
															                     <div v-else-if='TClass == 7'>
														
 
															                         <label>代码/脚本内容（<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例）： </label>
														
 
															-                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令，该循环项用arguments[0]表示，返回值大于0或为真则执行此分支内操作，否则不执行。如：return arguments[0].innerText.length >=5 即判断当前循环项的文本长度是否大于5，注意要配合循环类型为元素相关（如不固定元素列表）使用。"></textarea>
														
 
															+                        <textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令，该循环项用arguments[0]表示，返回值大于0或为真则执行此分支内操作，否则不执行。如：return arguments[0].innerText.length >5 即判断当前循环项的文本长度是否大于5，注意要配合循环类型为元素相关（如不固定元素列表）使用。"></textarea>
														
 
															                         <label>最长等待脚本执行时间（0代表无限等待）： </label>
														
 
															                         <input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
														
 
															                     </div>
														
--- a/ElectronJS/src/taskGrid/executeTask.html
+++ b/ElectronJS/src/taskGrid/executeTask.html
@@ -10,7 +10,7 @@
 
															     <script src="vue.js"></script>
														
 
															     <script src="bootstrap/js/bootstrap.js"></script>
														
 
															     <link href="bootstrap/css/bootstrap.css" rel="stylesheet"></link>
														
 
															-    <title>任务执行 | Task Execute</title>
														
 
															+    <title>任务执行 | Task Execution</title>
														
 
															     <style>
														
 
															         table {
														
 
															             table-layout: auto;
														
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
 
															             "justMyCode": false,
														
 
															             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
														
 
															             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
														
 
															-            "args": ["--ids", "[40]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
														
 
															+            "args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
														
 
															         "--read_type", "remote"]
														
 
															             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
														
 
															         }
														
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -513,8 +513,8 @@ class BrowserThread(Thread):
 
															             self.urlId = self.urlId + 1
														
 
															         files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															         # 如果目录为空，则删除该目录
														
 
															-        if not files:
														
 
															-            os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															+        # if not files:
														
 
															+        #     os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
														
 
															         self.print_and_log("Done!")
														
 
															         self.print_and_log("执行完成！")
														
 
															         self.saveData(exit=True)
														
--- a/ExecuteStage/utils.py
+++ b/ExecuteStage/utils.py
@@ -156,6 +156,10 @@ def on_release_creator(event, press_time):
 
															 #         time.sleep(1)  # 每秒检查一次
														
 
															 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
														
 
															+    try:
														
 
															+        splitLine = param["splitLine"]
														
 
															+    except:
														
 
															+        param["splitLine"] = 0
														
 
															     if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
														
 
															         if param["nodeType"] <= 2:
														
 
															             if ignoreWaitElement or waitElement == "":