Browse Source

Split Line

naibo 1 year ago
parent
commit
ae3ae40640

+ 2 - 0
.temp_to_pub/.gitignore

@@ -11,3 +11,5 @@ config.json
 mysql_config.json
 **/Code
 **/user_data
+**/tasks
+**/execution_instances

+ 5 - 0
ElectronJS/src/taskGrid/FlowChart.html

@@ -320,6 +320,11 @@
 <!--                            <option :value = 0>普通提取</option>-->
 <!--                            <option :value = 1>OCR提取</option>-->
 <!--                        </select>-->
+                       <label style="margin-top: 15px">Wrap content to new line (set when collecting long articles and wanting to wrap):</label>
+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
+                            <option :value="0">No</option>
+                            <option :value="1">Yes</option>
+                        </select>
                         <label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
                         <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
                             <option :value = 1>Yes</option>

+ 5 - 0
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -320,6 +320,11 @@
 <!--                            <option :value = 0>普通提取</option>-->
 <!--                            <option :value = 1>OCR提取</option>-->
 <!--                        </select>-->
+                        <label style="margin-top: 15px">是否将内容换行(长文章采集想要换行时设置):</label>
+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
+                            <option :value = 0>否</option>
+                            <option :value = 1>是</option>
+                        </select>
                         <label style="margin-top: 15px">是否保存该字段(只想把此字段当变量而不想保存时可选否):</label>
                         <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
                             <option :value = 1>是</option>

+ 1 - 0
ElectronJS/src/taskGrid/logic.js

@@ -81,6 +81,7 @@ function changeGetDataParameters(msg, i) {
     msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
     msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
     msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
+    msg["parameters"][i]["splitLine"] = 0; //是否分割行
 }
 
 

File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/294.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/295.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/296.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/297.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/298.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/299.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/300.json


+ 1 - 1
ExecuteStage/.vscode/launch.json

@@ -12,7 +12,7 @@
             "justMyCode": false,
             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
+            "args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
         "--read_type", "remote"]
             // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
         }

+ 13 - 5
ExecuteStage/easyspider_executestage.py

@@ -6,8 +6,8 @@ import platform
 import shutil
 import string
 import undetected_chromedriver as uc
-from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
-    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
+from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
+    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
 from myChrome import MyChrome
 from threading import Thread, Event
 from PIL import Image
@@ -295,9 +295,13 @@ class BrowserThread(Thread):
                     except:
                         pass
                     try:
-                        node["parameters"]["recordASField"] += param["recordASField"]
+                        node["parameters"]["recordASField"] = param["recordASField"]
                     except:
-                        node["parameters"]["recordASField"] += 1
+                        node["parameters"]["recordASField"] = 1
+                    try:
+                        splitLine = int(param["splitLine"])
+                    except:
+                        param["splitLine"] = 0
                     if param["contentType"] == 8:
                         self.print_and_log(
                             "默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
@@ -1754,7 +1758,11 @@ class BrowserThread(Thread):
                     download_image(self, content, "Data/Task_" +
                                    str(self.id) + "/" + self.saveName + "/", element)
             else:  # 普通节点
-                content = element.text
+                if p["splitLine"] == 1:
+                    text = extract_text_from_html(element.get_attribute('outerHTML'))
+                    content = split_text_by_lines(text)
+                else:
+                    content = element.text
         elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
             if p["nodeType"] == 2:
                 if element.get_attribute("href") != None:

+ 1 - 0
ExecuteStage/requirements.txt

@@ -9,4 +9,5 @@ pymysql==1.1.0
 lxml==4.9.2
 ddddocr==1.4.10
 pynput==1.7.6
+beautifulsoup4==4.12.2
 undetected-chromedriver==3.4.7

+ 18 - 1
ExecuteStage/utils.py

@@ -7,6 +7,7 @@ import sys
 import re
 import time
 import uuid
+from bs4 import BeautifulSoup
 # import keyboard
 from openpyxl import Workbook, load_workbook
 # import pandas as pd
@@ -71,6 +72,22 @@ def is_valid_url(url):
 def lowercase_tags_in_xpath(xpath):
     return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
 
+# Extract the text content from an HTML string (script/style content removed)
+def extract_text_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'lxml') # parse with the lxml parser
+    for script in soup(["script", "style"]): # remove script and style elements
+        script.extract()
+    for p_tag in soup.find_all("p"):
+        p_tag.append(soup.new_tag("br")) # append a <br> tag at the end of each <p>
+        p_tag.append("\n") # append a newline at the end of each <p>
+    text = soup.get_text()
+    return text
+
+# Split the text into lines, strip extra whitespace, and re-join with "\n"
+def split_text_by_lines(text):
+    lines = text.splitlines()
+    lines = [line.strip() for line in lines if line.strip()]  # drop blank lines; trim leading/trailing whitespace
+    return "\n".join(lines)
 
 def on_press_creator(press_time, event):
     def on_press(key):
@@ -139,7 +156,7 @@ def on_release_creator(event, press_time):
 #         time.sleep(1)  # 每秒检查一次
 
 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
-    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
+    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
         if param["nodeType"] <= 2:
             if ignoreWaitElement or waitElement == "":
                 return True

Some files were not shown because too many files changed in this diff