1 year ago · ae3ae40640
--- a/.temp_to_pub/.gitignore
+++ b/.temp_to_pub/.gitignore
@@ -11,3 +11,5 @@ config.json
 
				 mysql_config.json
			
 
				 **/Code
			
 
				 **/user_data
			
 
				+**/tasks
			
 
				+**/execution_instances
			
--- a/ElectronJS/src/taskGrid/FlowChart.html
+++ b/ElectronJS/src/taskGrid/FlowChart.html
@@ -320,6 +320,11 @@
 
				 <!--                            <option :value = 0>普通提取</option>-->
			
 
				 <!--                            <option :value = 1>OCR提取</option>-->
			
 
				 <!--                        </select>-->
			
 
				+                       <label style="margin-top: 15px">Wrap content to new line (set when collecting long articles and wanting to wrap):</label>
			
 
				+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
			
 
				+                            <option :value="0">No</option>
			
 
				+                            <option :value="1">Yes</option>
			
 
				+                        </select>
			
 
				                         <label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
			
 
				                         <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
			
 
				                             <option :value = 1>Yes</option>
			
--- a/ElectronJS/src/taskGrid/FlowChart_CN.html
+++ b/ElectronJS/src/taskGrid/FlowChart_CN.html
@@ -320,6 +320,11 @@
 
				 <!--                            <option :value = 0>普通提取</option>-->
			
 
				 <!--                            <option :value = 1>OCR提取</option>-->
			
 
				 <!--                        </select>-->
			
 
				+                        <label style="margin-top: 15px">是否将内容换行（长文章采集想要换行时设置）：</label>
			
 
				+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
			
 
				+                            <option :value = 0>否</option>
			
 
				+                            <option :value = 1>是</option>
			
 
				+                        </select>
			
 
				                         <label style="margin-top: 15px">是否保存该字段（只想把此字段当变量而不想保存时可选否）：</label>
			
 
				                         <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
			
 
				                             <option :value = 1>是</option>
			
--- a/ElectronJS/src/taskGrid/logic.js
+++ b/ElectronJS/src/taskGrid/logic.js
@@ -81,6 +81,7 @@ function changeGetDataParameters(msg, i) {
 
				     msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
			
 
				     msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
			
 
				     msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
			
 
				+    msg["parameters"][i]["splitLine"] = 0; //是否分割行
			
 
				 }
			
 
				 
			
 
				 
			
--- a/ElectronJS/tasks/294.json
+++ b/ElectronJS/tasks/294.json
--- a/ElectronJS/tasks/295.json
+++ b/ElectronJS/tasks/295.json
--- a/ElectronJS/tasks/296.json
+++ b/ElectronJS/tasks/296.json
--- a/ElectronJS/tasks/297.json
+++ b/ElectronJS/tasks/297.json
--- a/ElectronJS/tasks/298.json
+++ b/ElectronJS/tasks/298.json
--- a/ElectronJS/tasks/299.json
+++ b/ElectronJS/tasks/299.json
--- a/ElectronJS/tasks/300.json
+++ b/ElectronJS/tasks/300.json
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
 
				             "justMyCode": false,
			
 
				             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
			
 
				             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
			
 
				-            "args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
			
 
				+            "args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
			
 
				         "--read_type", "remote"]
			
 
				             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
			
 
				         }
			
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -6,8 +6,8 @@ import platform
 
				 import shutil
			
 
				 import string
			
 
				 import undetected_chromedriver as uc
			
 
				-from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
			
 
				-    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
			
 
				+from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
			
 
				+    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
			
 
				 from myChrome import MyChrome
			
 
				 from threading import Thread, Event
			
 
				 from PIL import Image
			
@@ -295,9 +295,13 @@ class BrowserThread(Thread):
 
				                     except:
			
 
				                         pass
			
 
				                     try:
			
 
				-                        node["parameters"]["recordASField"] += param["recordASField"]
			
 
				+                        node["parameters"]["recordASField"] = param["recordASField"]
			
 
				                     except:
			
 
				-                        node["parameters"]["recordASField"] += 1
			
 
				+                        node["parameters"]["recordASField"] = 1
			
 
				+                    try:
			
 
				+                        splitLine = int(param["splitLine"])
			
 
				+                    except:
			
 
				+                        param["splitLine"] = 0
			
 
				                     if param["contentType"] == 8:
			
 
				                         self.print_and_log(
			
 
				                             "默认的ddddocr识别功能如果觉得不好用，可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行；或者可以先设置采集内容类型为“元素截图”把图片保存下来，然后用自定义操作调用自己写的程序，程序的功能是读取这个最新生成的图片，然后用好用的模型，如PaddleOCR把图片识别出来，然后把返回值返回给程序作为参数输出。")
			
@@ -1754,7 +1758,11 @@ class BrowserThread(Thread):
 
				                     download_image(self, content, "Data/Task_" +
			
 
				                                    str(self.id) + "/" + self.saveName + "/", element)
			
 
				             else:  # 普通节点
			
 
				-                content = element.text
			
 
				+                if p["splitLine"] == 1:
			
 
				+                    text = extract_text_from_html(element.get_attribute('outerHTML'))
			
 
				+                    content = split_text_by_lines(text)
			
 
				+                else:
			
 
				+                    content = element.text
			
 
				         elif p["contentType"] == 1:  # 只采集当期元素下的文本，不包括子元素
			
 
				             if p["nodeType"] == 2:
			
 
				                 if element.get_attribute("href") != None:
			
--- a/ExecuteStage/requirements.txt
+++ b/ExecuteStage/requirements.txt
@@ -9,4 +9,5 @@ pymysql==1.1.0
 
				 lxml==4.9.2
			
 
				 ddddocr==1.4.10
			
 
				 pynput==1.7.6
			
 
				+beautifulsoup4==4.12.2
			
 
				 undetected-chromedriver==3.4.7
			
--- a/ExecuteStage/utils.py
+++ b/ExecuteStage/utils.py
@@ -7,6 +7,7 @@ import sys
 
				 import re
			
 
				 import time
			
 
				 import uuid
			
 
				+from bs4 import BeautifulSoup
			
 
				 # import keyboard
			
 
				 from openpyxl import Workbook, load_workbook
			
 
				 # import pandas as pd
			
@@ -71,6 +72,22 @@ def is_valid_url(url):
 
				 def lowercase_tags_in_xpath(xpath):
			
 
				     return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
			
 
				 
			
 
				# Extract the visible text content from an HTML fragment.
				def extract_text_from_html(html_content):
				    """Return the text of ``html_content`` with paragraph/line breaks kept.
				    Args:
				        html_content: HTML markup as a string (e.g. an element's
				            ``outerHTML``). ``None`` or an empty string yields ``""``.
				    Returns:
				        The concatenated text of the fragment. ``<script>`` and
				        ``<style>`` contents are removed, every ``<br>`` becomes a
				        newline and every ``<p>`` is terminated by a newline, so the
				        result can be split into lines afterwards.
				    """
				    # get_attribute('outerHTML') may hand back None; nothing to parse then.
				    if not html_content:
				        return ""
				    soup = BeautifulSoup(html_content, 'lxml')  # lxml is the parser backend
				    # Script and style bodies are not visible text; drop them entirely.
				    for hidden in soup(["script", "style"]):
				        hidden.extract()
				    # get_text() ignores tags, so a <br> contributes no text on its own.
				    # Swap each one for a real newline to keep the visual line break.
				    for br_tag in soup.find_all("br"):
				        br_tag.replace_with("\n")
				    # Terminate every paragraph with a newline so that adjacent
				    # paragraphs do not run together in the extracted text.
				    for p_tag in soup.find_all("p"):
				        p_tag.append("\n")
				    return soup.get_text()
			
 
				+
			
 
				# Split text into lines and remove surrounding/extra whitespace.
				def split_text_by_lines(text):
				    """Normalize ``text`` to one trimmed, non-empty line per row.
				    Args:
				        text: The text to normalize. ``None`` or an empty string
				            yields ``""``.
				    Returns:
				        The lines of ``text`` with leading/trailing whitespace removed
				        and blank lines dropped, joined with ``"\\n"``.
				    """
				    # Nothing to split for None / empty input.
				    if not text:
				        return ""
				    # Strip each line exactly once, then keep only the non-empty ones.
				    trimmed = (line.strip() for line in text.splitlines())
				    return "\n".join(line for line in trimmed if line)
			
 
				 
			
 
				 def on_press_creator(press_time, event):
			
 
				     def on_press(key):
			
@@ -139,7 +156,7 @@ def on_release_creator(event, press_time):
 
				 #         time.sleep(1)  # 每秒检查一次
			
 
				 
			
 
				 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
			
 
				-    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
			
 
				+    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
			
 
				         if param["nodeType"] <= 2:
			
 
				             if ignoreWaitElement or waitElement == "":
			
 
				                 return True