1 year ago · ea2d679dd4
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/294.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/294.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/295.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/295.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/296.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/296.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/297.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/297.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/298.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/298.json
--- a/ElectronJS/src/taskGrid/FlowChart.html
+++ b/ElectronJS/src/taskGrid/FlowChart.html
@@ -690,7 +690,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
 
				                     <input spellcheck=false onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
			
 
				                     <label>Export Data Format (Excel/CSV/TXT/Database):</label>
			
 
				                     <select id="outputFormat" class="form-control">
			
 
				-                        <option value = "xlsx">XLSX (EXCEL, note that a single Excel cell can save up to 32767 characters)</option>
			
 
				+                        <option value = "xlsx">XLSX (EXCEL, we suggest using the CSV format if the length of a single cell exceeds 500)</option>
			
 
				                         <option value = "csv">CSV</option>
			
 
				                         <option value = "txt">TXT</option>
			
 
				                         <option value = "json">JSON</option>
			
--- a/ElectronJS/src/taskGrid/FlowChart_CN.html
+++ b/ElectronJS/src/taskGrid/FlowChart_CN.html
@@ -690,7 +690,7 @@ print(emotlib.emoji()) # 使用其中的函数。
 
				                     <input spellcheck=false onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
			
 
				                     <label>导出数据格式（Excel/CSV/TXT/数据库，<a href="https://www.bilibili.com/video/BV1os4y1679S/" target="_blank">查看MySQL操作教程</a>）：</label>
			
 
				                     <select id="outputFormat" class="form-control">
			
 
				-                        <option value = "xlsx">XLSX（即EXCEL文件，注意Excel单个单元格最多可存储32767字符）</option>
			
 
				+                        <option value = "xlsx">XLSX（即EXCEL文件，建议单个单元格长度超过500时使用CSV格式存储）</option>
			
 
				                         <option value = "csv">CSV</option>
			
 
				                         <option value = "txt">TXT</option>
			
 
				                         <option value = "json">JSON</option>
			
--- a/ElectronJS/src/taskGrid/logic.js
+++ b/ElectronJS/src/taskGrid/logic.js
@@ -55,7 +55,12 @@ function changeOutputFormat(param) {
 
				             if (len > 20000) {
			
 
				                 if ($("#outputFormat").val() == "xlsx") {
			
 
				                     $("#outputFormat").val("csv"); //如果有一个参数的示例值长度超过20000，就默认输出为csv
			
 
				-                    showInfo(LANG("示例值长度超过16000，超出Excel单个单元格存储限制，已自动切换保存为csv格式。", "The length of the example value exceeds 16000, and the csv save format has been automatically switched."));
			
 
				+                    showInfo(LANG("单个字段示例值长度超过16000，超出Excel单个单元格存储限制，已自动切换保存为csv格式。", "The length of the example value of a single field exceeds 16000, which exceeds the storage limit of a single cell of Excel, and has been automatically switched to save as csv format."), 5000);
			
 
				+                }
			
 
				+                break;
			
 
				+            } else if (len > 500) {
			
 
				+                if ($("#outputFormat").val() == "xlsx") {
			
 
				+                    showInfo(LANG("单个字段示例值长度超过300，建议保存为CSV格式，否则可能会出现数据存储不完整的情况（Python Excel写入库openpyxl的Bug）。", "The length of the example value of a single field exceeds 300, it is recommended to save as CSV format, otherwise there may be a situation where the data storage is incomplete (Bug of Python Excel write library openpyxl)."), 10000);
			
 
				                 }
			
 
				                 break;
			
 
				             }
			
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
 
				             "justMyCode": false,
			
 
				             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
			
 
				             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
			
 
				-            "args": ["--ids", "[8]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
			
 
				+            "args": ["--ids", "[28]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
			
 
				         "--read_type", "remote"]
			
 
				             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
			
 
				         }
			
--- a/ExecuteStage/requirements.txt
+++ b/ExecuteStage/requirements.txt
@@ -3,6 +3,7 @@ requests==2.31.0
 
				 selenium==4.16.0
			
 
				 pyinstaller==5.13.2
			
 
				 Pillow==10.0.1
			
 
				+xlsxwriter==3.1.9
			
 
				 openpyxl==3.1.2
			
 
				 pymysql==1.1.0
			
 
				 lxml==4.9.2
			
--- a/ExecuteStage/utils.py
+++ b/ExecuteStage/utils.py
@@ -9,6 +9,8 @@ import time
 
				 import uuid
			
 
				 # import keyboard
			
 
				 from openpyxl import Workbook, load_workbook
			
 
				+import pandas as pd
			
 
				+import xlsxwriter
			
 
				 import requests
			
 
				 from urllib.parse import urlparse
			
 
				 import pymysql
			
@@ -336,11 +338,115 @@ def write_to_json(file_name, data, types, record, keys):
 
				 
			
 
				 
			
 
				 def write_to_excel(file_name, data, types, record):
			
 
				+    # 首先，检查文件是否存在来决定是否处理第一行
			
 
				+    # first = not os.path.exists(file_name)
			
 
				+
			
 
				+    # # 准备新数据
			
 
				+    # new_data = pd.DataFrame(data)
			
 
				+
			
 
				+    # # 如果不是第一行（即文件已存在），对数据应用类型转换
			
 
				+    # if not first:
			
 
				+    #     for i, col_type in enumerate(types):
			
 
				+    #         if col_type == "int" or col_type == "bigInt":
			
 
				+    #             try:
			
 
				+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce').astype(int)
			
 
				+    #             except:
			
 
				+    #                 new_data[i] = pd.to_numeric("0", errors='coerce').astype(int)
			
 
				+    #         elif col_type == "double":
			
 
				+    #             try:
			
 
				+    #                 new_data[i] = pd.to_numeric(new_data[i], errors='coerce')(0.0)
			
 
				+    #             except:
			
 
				+    #                 new_data[i] = pd.to_numeric("0.0", errors='coerce').astype(float)
			
 
				+    # # 根据 record 筛选列
			
 
				+    # new_data = new_data.loc[:, record]
			
 
				+
			
 
				+    # # 如果文件存在，则读取现有数据并追加新数据
			
 
				+    # if first:
			
 
				+    #     combined_data = new_data
			
 
				+    # else:
			
 
				+    #     # 使用 Pandas 读取现有数据
			
 
				+    #     existing_data = pd.read_excel(file_name)
			
 
				+    #     # 合并现有数据与新数据
			
 
				+    #     combined_data = pd.concat([existing_data, new_data], ignore_index=True)
			
 
				+
			
 
				+    # # 将合并后的数据写入 Excel
			
 
				+    # combined_data.to_excel(file_name, index=False, engine='openpyxl')
			
 
				+
			
 
				+    # existing_data = []
			
 
				+    # first = True
			
 
				+    # # 检查文件是否存在
			
 
				+    # if os.path.exists(file_name):
			
 
				+    #     # 使用 openpyxl 读取现有数据
			
 
				+    #     workbook = load_workbook(file_name, read_only=True)
			
 
				+    #     sheet = workbook.active
			
 
				+    #     # 读取已有行数
			
 
				+    #     num_rows = sheet.max_row
			
 
				+    #     if num_rows > 5000:
			
 
				+    #         print("Excel文件中的数据行数超过5000行，过多的行数将会导致追加模式写入数据速度变慢，建议更换为CSV文件或MySQL数据库存储数据。正在读取数据，请稍等...")
			
 
				+    #         print("The number of rows in the Excel file exceeds 5000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to replace it with CSV file or MySQL database to store data. Reading data, please wait...")
			
 
				+    #     # existing_data = [[sheet.cell(row=i, column=j).value for j in range(1, sheet.max_column + 1)] for i in range(1, sheet.max_row + 1)]
			
 
				+    #     for i in range(1, sheet.max_row + 1):
			
 
				+    #         row_data = []
			
 
				+    #         if num_rows > 5000 and i % 500 == 0:
			
 
				+    #             print(f"正在读取第{i}/{num_rows}行的数据...")
			
 
				+    #             print(f"Reading data of row {i}/{num_rows}...")
			
 
				+    #         for j in range(1, sheet.max_column + 1):
			
 
				+    #             cell = sheet.cell(row=i, column=j).value
			
 
				+    #             if cell is None:
			
 
				+    #                 cell = ""
			
 
				+    #             row_data.append(cell)
			
 
				+    #         existing_data.append(row_data)
			
 
				+    #     first = False  # 如果文件存在，首行不再是标题行
			
 
				+
			
 
				+    # # 使用 xlsxwriter 创建新文件
			
 
				+    # workbook = xlsxwriter.Workbook(file_name)
			
 
				+    # worksheet = workbook.add_worksheet()
			
 
				+
			
 
				+    # # 写入现有数据
			
 
				+    # for row_num, row_data in enumerate(existing_data):
			
 
				+    #     for col_num, cell in enumerate(row_data):
			
 
				+    #         worksheet.write(row_num, col_num, cell)
			
 
				+
			
 
				+    # # 写入新数据
			
 
				+    # row = len(existing_data)
			
 
				+    # for line in data:
			
 
				+    #     to_write = []
			
 
				+    #     for i in range(len(line)):
			
 
				+    #         value = line[i]
			
 
				+    #         if not first:  # 如果不是第一行，需要转换数据类型
			
 
				+    #             if types[i] == "int" or types[i] == "bigInt":
			
 
				+    #                 try:
			
 
				+    #                     value = int(value)
			
 
				+    #                 except ValueError:
			
 
				+    #                     value = 0
			
 
				+    #             elif types[i] == "double":
			
 
				+    #                 try:
			
 
				+    #                     value = float(value)
			
 
				+    #                 except ValueError:
			
 
				+    #                     value = 0.0
			
 
				+    #         if record[i]:
			
 
				+    #             to_write.append(value)
			
 
				+    #     first = False  # 更新 first 以跳过数据类型转换
			
 
				+    #     for col, item in enumerate(to_write):
			
 
				+    #         worksheet.write(row, col, item)
			
 
				+    #     row += 1
			
 
				+
			
 
				+    # # 关闭工作簿
			
 
				+    # workbook.close()
			
 
				+
			
 
				     first = False
			
 
				     if os.path.exists(file_name):
			
 
				         # 加载现有的工作簿
			
 
				         wb = load_workbook(file_name)
			
 
				+        # 行数读取
			
 
				+        num_rows = wb.active.max_row
			
 
				+        if num_rows > 1000:
			
 
				+            print("Excel文件中的数据行数已超过1000行，过多的行数将会导致追加模式写入数据速度变慢，建议增大任务保存对话框中的“每采集多少条数据保存一次”选项的值以提升采集速度，或者更换为CSV文件或MySQL数据库存储数据。正在读取数据，请稍等...")
			
 
				+            print("The number of rows in the Excel file already exceeds 1000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to increase the value of the 'Save every how many data' option in the task save dialog to improve the collection speed, or replace it with CSV file or MySQL database to store data. Reading data, please wait...")
			
 
				         ws = wb.active
			
 
				+        if num_rows > 1000:
			
 
				+            print("读取数据完成，正在追加数据...")
			
 
				+            print("Reading data completed, appending data...")
			
 
				     else:
			
 
				         # 创建新的工作簿和工作表
			
 
				         wb = Workbook()