Просмотр исходного кода

MacOS with two execute stage version

Naibo_Mac_M2 2 года назад
Родитель
Коммит
476cec0537

+ 1 - 0
.temp_to_pub/.gitignore

@@ -1,4 +1,5 @@
 EasySpider_MacOS/easyspider_executestage
+EasySpider_MacOS/easyspider_executestage_full
 EasySpider_Linux64_x64/user_data
 EasySpider_windows_x32/user_data
 EasySpider

+ 1 - 1
ElectronJS/config.json

@@ -1 +1 @@
-{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
+{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}

+ 20 - 20
ElectronJS/main.js

@@ -59,27 +59,27 @@ let chromeBinaryPath = "";
 let execute_path = "";
 console.log(process.arch);
 
-exec(`wmic os get Caption`, function (error, stdout, stderr) {
-    if (error) {
-        console.error(`执行的错误: ${error}`);
-        return;
-    }
+// exec(`wmic os get Caption`, function (error, stdout, stderr) {
+//     if (error) {
+//         console.error(`执行的错误: ${error}`);
+//         return;
+//     }
 
-    if (stdout.includes("Windows 7")) {
-        console.log("Windows 7");
-        let sys_arch = config.sys_arch;
-        if (sys_arch === "x64") {
-            dialog.showMessageBoxSync({
-                type: "error",
-                title: "Error",
-                message:
-                    "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
-            });
-        }
-    } else {
-        console.log("Not Windows 7");
-    }
-});
+//     if (stdout.includes("Windows 7")) {
+//         console.log("Windows 7");
+//         let sys_arch = config.sys_arch;
+//         if (sys_arch === "x64") {
+//             dialog.showMessageBoxSync({
+//                 type: "error",
+//                 title: "Error",
+//                 message:
+//                     "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
+//             });
+//         }
+//     } else {
+//         console.log("Not Windows 7");
+//     }
+// });
 
 if (process.platform === "win32" && process.arch === "ia32") {
     driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");

+ 4 - 1
ElectronJS/src/taskGrid/executeTask.html

@@ -79,8 +79,9 @@
                     <div class="modal-body">
                         <input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
                         <input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
-                        <label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
                         <label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction" target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
+                        <label v-if="OS=='darwin'">{{`对于MacOS系统,EasySpider提供了两个不同的执行程序,分别为easyspider_executestage和easyspider_executestage_full,前者执行时加载速度较快,并提供了除OCR识别和数据去重以外的全部功能;后者则提供了包括OCR识别和数据去重在内的全部功能,但运行时加载速度较慢,需要等待2-10分钟才能执行程序,请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
+                        <label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
                         <textarea class="form-control" style="height:150px">cd {{easyspider_location}}
 {{command}} --config_folder "{{config_folder}}" --headless 0 --read_type remote --config_file_name config.json --saved_file_name </textarea>
                     </div>
@@ -314,6 +315,7 @@
             config_folder: "",
             easyspider_location: "",
             mysql_config_path: "",
+            OS: "win32",
         }, mounted() {
         $.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
             app.$data.user_data_folder = result.user_data_folder;
@@ -540,6 +542,7 @@
 
     function changeCommand() {
         $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
+            app.$data.OS = OSInfo.version;
             if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){
                 app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
             } else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){

Разница между файлами не показана из-за своего большого размера
+ 0 - 0
ElectronJS/tasks/308.json


+ 16 - 7
ExecuteStage/easyspider_executestage.py

@@ -44,14 +44,25 @@ import sys
 # import hashlib
 import time
 import requests
-from ddddocr import DdddOcr
+from multiprocessing import freeze_support
+freeze_support()  # 防止无限死循环多开
+try:
+    from ddddocr import DdddOcr
+    import onnxruntime
+    onnxruntime.set_default_logger_severity(3)  # 隐藏onnxruntime的日志
+except:
+    print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
+    print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
+    time.sleep(2)
 from urllib.parse import urljoin
 from lxml import etree, html
+try:
+    import pandas as pd
+except:
+    print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
+    print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
+    time.sleep(2)
 
-import onnxruntime
-
-onnxruntime.set_default_logger_severity(3)  # 隐藏onnxruntime的日志
-import pandas as pd
 # import numpy
 # import pytesseract
 # import uuid
@@ -2185,8 +2196,6 @@ class BrowserThread(Thread):
             self.OUTPUT.append(line)
 
 if __name__ == '__main__':
-    from multiprocessing import freeze_support
-    freeze_support()  # 防止无限死循环多开
     # 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
     # If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
     config = {

+ 12 - 2
ExecuteStage/generateExecutable_Macos.sh

@@ -1,7 +1,17 @@
+# 先打包一个不带ddddocr和pandas的版本,然后再打包一个带的版本,不带ddddocr和pandas的版本运行速度会快很多
 rm -r build
 rm -r dist
-pyinstaller -F --icon=favicon.ico  --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi"  --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
+pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn 
+
 rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
-rm ../ElectronJS/easyspider_executestage
 cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
 # mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage
+
+echo "With ddddocr and pandas"
+
+# # 打包带ddddocr和pandas的版本
+rm -r build
+rm -r dist
+pyinstaller -F --icon=favicon.ico  --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi"  --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
+rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
+cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full

+ 34 - 0
ExecuteStage/package_size.py

@@ -0,0 +1,34 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+# Locate the "lib" directory of the current Python environment (sys.prefix / "lib")
+lib_path = Path(sys.prefix) / "lib"
+
+# List all installed packages and their versions via pip; the first two output lines are skipped (presumably the table header of `pip list` - confirm)
+installed_packages = subprocess.check_output([sys.executable, '-m', 'pip', 'list']).decode().strip().split('\n')[2:]
+
+# Dictionary mapping each package name to its total on-disk size in bytes
+package_sizes = {}
+
+# For every installed package, find the matching directory and compute its size
+for package in installed_packages:
+    name, version = package.split()[:2]
+    package_size = 0
+
+    # Look for the top-level directory that corresponds to the package name.
+    # Note: the package name is used directly as the directory name, which does not hold in some cases.
+    # For example, Google's protobuf package lives under 'google' and 'protobuf' on the file system.
+    # Such cases need special handling, or the package metadata should be used to find the correct top-level directory.
+    package_dir = lib_path / "python{0}.{1}".format(*sys.version_info) / "site-packages" / name
+
+    # Sum the sizes of all regular files under the directory (stays 0 when the directory does not exist)
+    if package_dir.exists():
+        package_size = sum(f.stat().st_size for f in package_dir.glob('**/*') if f.is_file())
+
+    package_sizes[name] = package_size
+
+# Print the packages sorted by size, largest first, in MB
+for name, size in sorted(package_sizes.items(), key=lambda item: item[1], reverse=True):
+    print(f"{name}: {size/1024/1024:.2f} MB")

+ 0 - 44
ExecuteStage/test.py

@@ -1,44 +0,0 @@
-# from lxml import etree
-
-# # 解析HTML
-# html = """
-# <div>
-# 123
-#   <ul class="list">
-#     <li class="item-0">first item</li>
-#     <li class="item-1"><a href="link2.html">second item</a></li>
-#   </ul>
-#   456
-#   <div></div>
-#   789
-# </div>
-# """
-# html = etree.HTML(html)
-# element = html.xpath("*")
-# direct_text = "/html/body/" + html[0][0].tag + "/text()"
-# all_text = "/html/body/" + html[0][0].tag + "//text()"
-# # 使用XPath选择元素
-# results = html.xpath(direct_text)
-# # print(results)
-# # 拼接所有文本内容并去掉两边的空白
-# text = ' '.join(result.strip() for result in results if result.strip())
-
-# # 输出结果
-# print(text)
-
-# results = html.xpath(all_text)
-# # print(results)
-# # 拼接所有文本内容并去掉两边的空白
-# text = ' '.join(result.strip() for result in results if result.strip())
-
-# # 输出结果
-# print(text)
-
-import re
-
-def lowercase_xpath_tags(xpath):
-    return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
-
-print(lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL'))
-print("//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]")
-print("")

Некоторые файлы не были показаны из-за большого количества измененных файлов