Browse Source

JSON Format Support

naibo 2 years ago
parent
commit
528ae7a132

+ 1 - 0
ElectronJS/src/taskGrid/FlowChart.html

@@ -639,6 +639,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
                         <option value = "xlsx">XLSX (EXCEL, note that a single Excel cell can save up to 32767 characters)</option>
                         <option value = "csv">CSV</option>
                         <option value = "txt">TXT</option>
+                        <option value = "json">JSON</option>
                         <option value = "mysql">MySQL Database</option>
                     </select>
                     <label>Export File Name/Database Table Name (Can use ../ to represent relative path to change the file save location,the keyword "current_time" will be replaced with the timestamp when the task is executed):</label>

+ 1 - 0
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -640,6 +640,7 @@ print(emotlib.emoji()) # 使用其中的函数。
                         <option value = "xlsx">XLSX(即EXCEL文件,注意Excel单个单元格最多可存储32767字符)</option>
                         <option value = "csv">CSV</option>
                         <option value = "txt">TXT</option>
+                        <option value = "json">JSON</option>
                         <option value = "mysql">MySQL数据库</option>
                     </select>
                     <label>导出文件名/数据库表格名称(可使用../表示相对路径以改变文件保存位置,名称中的“current_time”会被替换为执行任务时的时间戳):</label>

File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/112.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/200.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/201.json


+ 1 - 1
ExecuteStage/.vscode/launch.json

@@ -12,7 +12,7 @@
             "justMyCode": false,
             //  "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
             // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--id", "[8]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
+            "args": ["--id", "[16]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
         }
     ]
 }

+ 17 - 16
ExecuteStage/easyspider_executestage.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # import atexit
-from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
+from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel, write_to_json
 from myChrome import MyChrome
 from threading import Thread, Event
 from PIL import Image
@@ -152,27 +152,24 @@ class BrowserThread(Thread):
             filter(isnotnull, service["links"].split("\n")))  # 要执行的link的列表
         self.OUTPUT = []  # 采集的数据
         self.writeMode = 1  # 写入模式,0为新建,1为追加
-        if self.outputFormat == "csv" or self.outputFormat == "txt":
+        if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
             if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
                 self.OUTPUT.append([])  # 添加表头
                 self.writeMode = 0
-        elif self.outputFormat == "xlsx":
-            if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.xlsx'):
-                self.OUTPUT.append([])  # 添加表头
-                self.writeMode = 0
+        elif self.outputFormat == "json":
+            self.writeMode = 3 # JSON模式无需判断是否存在文件
         elif self.outputFormat == "mysql":
             self.mysql = myMySQL(config["mysql_config_path"])
             self.mysql.create_table(self.saveName, service["outputParameters"])
             self.writeMode = 2
-        if self.writeMode == 1:
-            self.print_and_log("追加模式")
-            self.print_and_log("Append Mode")
-        elif self.writeMode == 0:
-            self.print_and_log("新建模式")
-            self.print_and_log("New Mode")
+        if self.writeMode == 0:
+            self.print_and_log("新建模式|Create Mode")
+        elif self.writeMode == 1:
+            self.print_and_log("追加模式|Append Mode")
         elif self.writeMode == 2:
-            self.print_and_log("MySQL模式")
-            self.print_and_log("MySQL Mode")
+            self.print_and_log("MySQL模式|MySQL Mode")
+        elif self.writeMode == 3:
+            self.print_and_log("JSON模式|JSON Mode")
         self.containJudge = service["containJudge"]  # 是否含有判断语句
         self.outputParameters = {}
         self.service = service
@@ -401,6 +398,10 @@ class BrowserThread(Thread):
                     str(self.id) + "/" + self.saveName + '.xlsx'
                 write_to_excel(
                     file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
+            elif self.outputFormat == "json":
+                file_name = "Data/Task_" + \
+                    str(self.id) + "/" + self.saveName + '.json'
+                write_to_json(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord, self.outputParameters.keys())
             elif self.outputFormat == "mysql":
                 self.mysql.write_to_mysql(
                     self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
@@ -1395,7 +1396,7 @@ class BrowserThread(Thread):
                 except:
                     downloadPic = 0
                 if downloadPic == 1:
-                    download_image(content, "Data/Task_" +
+                    download_image(self, content, "Data/Task_" +
                                    str(self.id) + "/" + self.saveName + "/")
             else:  # 普通节点
                 content = element.text
@@ -1420,7 +1421,7 @@ class BrowserThread(Thread):
                 except:
                     downloadPic = 0
                 if downloadPic == 1:
-                    download_image(content, "Data/Task_" +
+                    download_image(self, content, "Data/Task_" +
                                    str(self.id) + "/" + self.saveName + "/")
             else:
                 command = 'var arr = [];\

+ 42 - 7
ExecuteStage/utils.py

@@ -95,7 +95,7 @@ def on_release_creator(event, press_time):
 #         time.sleep(1)  # 每秒检查一次
 
 
-def download_image(url, save_directory):
+def download_image(browser, url, save_directory):
     # 定义浏览器头信息
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -120,15 +120,15 @@ def download_image(url, save_directory):
             with open(save_path, 'wb') as file:
                 file.write(response.content)
 
-            print("图片已成功下载到:", save_path)
-            print("The image has been successfully downloaded to:", save_path)
+            browser.print_and_log("图片已成功下载到:", save_path)
+            browser.print_and_log("The image has been successfully downloaded to:", save_path)
         else:
-            print("下载图片失败,请检查此图片链接是否有效:", url)
-            print(
+            browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
+            browser.print_and_log(
                 "Failed to download image, please check if this image link is valid:", url)
     else:
-        print("下载图片失败,请检查此图片链接是否有效:", url)
-        print("Failed to download image, please check if this image link is valid:", url)
+        browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
+        browser.print_and_log("Failed to download image, please check if this image link is valid:", url)
 
 
 def get_output_code(output):
@@ -182,6 +182,41 @@ def replace_field_values(orginal_text, outputParameters):
     return replaced_text
 
 
+def write_to_json(file_name, data, types, record, keys):
+    keys = list(keys)
+    # Prepare empty list for data
+    data_to_write = []
+    # Transform data and append to list
+    for line in data:
+        to_write = {}
+        for i in range(len(line)):
+            if types[i] == "int" or types[i] == "bigInt":
+                try:
+                    line[i] = int(line[i])
+                except:
+                    line[i] = 0
+            elif types[i] == "double":
+                try:
+                    line[i] = float(line[i])
+                except:
+                    line[i] = 0.0
+            if record[i]:
+                 to_write.update({keys[i]: line[i]})
+        data_to_write.append(to_write)
+    
+    try:
+        # read data from JSON
+        with open(file_name, 'r', encoding='utf-8') as f:
+            json_data = json.load(f)
+    except:
+        json_data = []
+
+    json_data.extend(data_to_write)
+    
+    # write data to JSON
+    with open(file_name, 'w', encoding='utf-8') as f:
+        json.dump(json_data, f, ensure_ascii=False)
+
 def write_to_excel(file_name, data, types, record):
     first = False
     if os.path.exists(file_name):

Some files were not shown because too many files changed in this diff