Jelajahi Sumber

新增下载图片功能

naibo 2 tahun lalu
induk
melakukan
42db55deb8

+ 4 - 0
ElectronJS/.gitignore

@@ -10,3 +10,7 @@ user_data/
 Data/
 Chrome/
 execution_instances/*
+EasySpider_en.crx
+EasySpider_zh.crx
+.DS_Store
+npminstall-debug.log

TEMPAT SAMPAH
ElectronJS/EasySpider_en.crx


TEMPAT SAMPAH
ElectronJS/EasySpider_zh.crx


+ 9 - 2
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -235,6 +235,13 @@
                             <option :value = 3>表单值</option>
                             <option :value = 4>图片地址</option>
                         </select>
+                        <div v-if='paras.parameters[paraIndex]["nodeType"] == 4'>
+                            <label>提取图片地址后是否同时下载图片</label>
+                            <select v-model='paras.parameters[paraIndex]["downloadPic"]' class="form-control">
+                                <option :value = 0>否</option>
+                                <option :value = 1>是</option>
+                            </select>
+                        </div>
 <!--                        <label>提取方式</label>-->
 <!--                        <select v-model='paras.parameters[paraIndex]["extractType"]' class="form-control">-->
 <!--                            <option :value = 0>普通提取</option>-->
@@ -390,8 +397,8 @@
                         <input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                     </div>
                     <div v-else-if='TClass == 7'>
-                        <label>代码/脚本内容: </label>
-                        <textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.indexOf('123') >=0 即判断当前循环项的文本是否包含123,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
+                        <label>代码/脚本内容(<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例): </label>
+                        <textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.length >=5 即判断当前循环项的文本长度是否大于等于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
                         <label>最长等待脚本执行时间(0代表无限等待): </label>
                         <input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                     </div>

+ 1 - 0
ElectronJS/src/taskGrid/logic_CN.js

@@ -44,6 +44,7 @@ function changeGetDataParameters(msg, i) {
     msg["parameters"][i]["JSWaitTime"] = 0; //JS等待时间
     msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
     msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
+    msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
 }
 
 function handleAddElement(msg) {

+ 335 - 0
ElectronJS/tasks/57.json

@@ -0,0 +1,335 @@
+{
+    "id": 57,
+    "name": "图片下载",
+    "url": "https://www.jd.com",
+    "links": "https://www.jd.com",
+    "create_time": "5/20/2023, 8:18:15 PM",
+    "containJudge": false,
+    "desc": "https://www.jd.com",
+    "inputParameters": [
+        {
+            "id": 0,
+            "name": "urlList_0",
+            "nodeId": 1,
+            "nodeName": "打开网页",
+            "value": "https://www.jd.com",
+            "desc": "要采集的网址列表,多行以\\n分开",
+            "type": "string",
+            "exampleValue": "https://www.jd.com"
+        }
+    ],
+    "outputParameters": [
+        {
+            "id": 0,
+            "name": "参数3_图片地址",
+            "desc": "",
+            "type": "string",
+            "exampleValue": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
+        }
+    ],
+    "graph": [
+        {
+            "index": 0,
+            "id": 0,
+            "parentId": 0,
+            "type": -1,
+            "option": 0,
+            "title": "root",
+            "sequence": [
+                1,
+                4
+            ],
+            "parameters": {
+                "history": 1,
+                "tabIndex": 0,
+                "useLoop": false,
+                "xpath": "",
+                "wait": 0
+            },
+            "isInLoop": false
+        },
+        {
+            "id": 1,
+            "index": 1,
+            "parentId": 0,
+            "type": 0,
+            "option": 1,
+            "title": "打开网页",
+            "sequence": [],
+            "isInLoop": false,
+            "position": 0,
+            "parameters": {
+                "useLoop": false,
+                "xpath": "",
+                "wait": 0,
+                "beforeJS": "",
+                "beforeJSWaitTime": 0,
+                "afterJS": "",
+                "afterJSWaitTime": 0,
+                "url": "https://www.jd.com",
+                "links": "https://www.jd.com",
+                "maxWaitTime": 10,
+                "scrollType": 0,
+                "scrollCount": 0
+            }
+        },
+        {
+            "id": -1,
+            "index": 2,
+            "parentId": 0,
+            "type": 1,
+            "option": 8,
+            "title": "循环",
+            "sequence": [
+                3
+            ],
+            "isInLoop": false,
+            "position": 1,
+            "parameters": {
+                "history": 4,
+                "tabIndex": -1,
+                "useLoop": false,
+                "xpath": "/html/body/div[4]/div[1]/div[4]/a",
+                "wait": 0,
+                "beforeJS": "",
+                "beforeJSWaitTime": 0,
+                "afterJS": "",
+                "afterJSWaitTime": 0,
+                "scrollType": 0,
+                "scrollCount": 0,
+                "loopType": 1,
+                "pathList": "",
+                "textList": "",
+                "code": "",
+                "waitTime": 0,
+                "exitCount": 0,
+                "historyWait": 2,
+                "allXPaths": [
+                    "/html/body/div[4]/div[1]/div[4]/a[1]",
+                    "//a[contains(., '平板電腦')]"
+                ]
+            }
+        },
+        {
+            "id": -1,
+            "index": 3,
+            "parentId": 2,
+            "type": 0,
+            "option": 3,
+            "title": "提取数据",
+            "sequence": [],
+            "isInLoop": true,
+            "position": 0,
+            "parameters": {
+                "history": 4,
+                "tabIndex": -1,
+                "useLoop": false,
+                "xpath": "",
+                "wait": 0,
+                "beforeJS": "",
+                "beforeJSWaitTime": 0,
+                "afterJS": "",
+                "afterJSWaitTime": 0,
+                "paras": [
+                    {
+                        "nodeType": 1,
+                        "contentType": 0,
+                        "relative": true,
+                        "name": "参数1_链接文本",
+                        "desc": "",
+                        "extractType": 0,
+                        "relativeXPath": "",
+                        "allXPaths": "",
+                        "exampleValues": [
+                            {
+                                "num": 0,
+                                "value": "平板電腦"
+                            },
+                            {
+                                "num": 1,
+                                "value": "爆款耳機"
+                            },
+                            {
+                                "num": 2,
+                                "value": "手機"
+                            },
+                            {
+                                "num": 3,
+                                "value": "數據線"
+                            },
+                            {
+                                "num": 4,
+                                "value": "年貨節"
+                            }
+                        ],
+                        "default": "",
+                        "beforeJS": "",
+                        "beforeJSWaitTime": 0,
+                        "JS": "",
+                        "JSWaitTime": 0,
+                        "afterJS": "",
+                        "afterJSWaitTime": 0,
+                        "downloadPic": 0
+                    },
+                    {
+                        "nodeType": 2,
+                        "contentType": 0,
+                        "relative": true,
+                        "name": "参数2_链接地址",
+                        "desc": "",
+                        "relativeXPath": "",
+                        "allXPaths": "",
+                        "exampleValues": [
+                            {
+                                "num": 0,
+                                "value": "https://search.jd.com/Search?keyword=%E5%B9%B3%E6%9D%BF%E7%94%B5%E8%84%91&enc=utf-8&wq=%E5%B9%B3%E6%9D%BF%E7%94%B5%E8%84%91&pvid=84c62205dccd43dfad1b6eb5fdf5077b"
+                            },
+                            {
+                                "num": 1,
+                                "value": "https://audio.jd.com/"
+                            },
+                            {
+                                "num": 2,
+                                "value": "https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&ev=exbrand_%E5%B0%8F%E7%B1%B3%EF%BC%88MI%EF%BC%89%5E&uc=0#J_searchWrap"
+                            },
+                            {
+                                "num": 3,
+                                "value": "https://mall.jd.com/index-1000007418.html"
+                            },
+                            {
+                                "num": 4,
+                                "value": "https://pro.jd.com/mall/active/22WyJjMqTCbvjj1YB3pSJssBonLR/index.html"
+                            }
+                        ],
+                        "default": "",
+                        "beforeJS": "",
+                        "beforeJSWaitTime": 0,
+                        "JS": "",
+                        "JSWaitTime": 0,
+                        "afterJS": "",
+                        "afterJSWaitTime": 0,
+                        "downloadPic": 0
+                    }
+                ],
+                "loopType": 1
+            }
+        },
+        {
+            "id": 2,
+            "index": 4,
+            "parentId": 0,
+            "type": 1,
+            "option": 8,
+            "title": "循环",
+            "sequence": [
+                5
+            ],
+            "isInLoop": false,
+            "position": 1,
+            "parameters": {
+                "history": 4,
+                "tabIndex": -1,
+                "useLoop": false,
+                "xpath": "/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div[1]/a[1]/img[1]",
+                "wait": 0,
+                "beforeJS": "",
+                "beforeJSWaitTime": 0,
+                "afterJS": "",
+                "afterJSWaitTime": 0,
+                "scrollType": 0,
+                "scrollCount": 0,
+                "loopType": 1,
+                "pathList": "",
+                "textList": "",
+                "code": "",
+                "waitTime": 0,
+                "exitCount": 0,
+                "historyWait": 2,
+                "allXPaths": [
+                    "/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/a[1]/img[1]",
+                    "//img[contains(., '')]"
+                ]
+            }
+        },
+        {
+            "id": 3,
+            "index": 5,
+            "parentId": 2,
+            "type": 0,
+            "option": 3,
+            "title": "提取数据",
+            "sequence": [],
+            "isInLoop": true,
+            "position": 0,
+            "parameters": {
+                "history": 4,
+                "tabIndex": -1,
+                "useLoop": false,
+                "xpath": "",
+                "wait": 0,
+                "beforeJS": "",
+                "beforeJSWaitTime": 0,
+                "afterJS": "",
+                "afterJSWaitTime": 0,
+                "paras": [
+                    {
+                        "nodeType": 4,
+                        "contentType": 0,
+                        "relative": true,
+                        "name": "参数3_图片地址",
+                        "desc": "",
+                        "extractType": 0,
+                        "relativeXPath": "",
+                        "allXPaths": "",
+                        "exampleValues": [
+                            {
+                                "num": 0,
+                                "value": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
+                            },
+                            {
+                                "num": 1,
+                                "value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 2,
+                                "value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/222655/28/27238/153145/644b858eF2cd1200f/e37bd7da42a814b0.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 3,
+                                "value": "//m.360buyimg.com/babel/s710x370_jfs/t1/197659/30/31344/62825/640fd751F694963ed/a6e1ac2e5c27f160.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 4,
+                                "value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 5,
+                                "value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/222655/28/27238/153145/644b858eF2cd1200f/e37bd7da42a814b0.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 6,
+                                "value": "//m.360buyimg.com/babel/s710x370_jfs/t1/197659/30/31344/62825/640fd751F694963ed/a6e1ac2e5c27f160.jpg!q70.dpg"
+                            },
+                            {
+                                "num": 7,
+                                "value": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
+                            },
+                            {
+                                "num": 8,
+                                "value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
+                            }
+                        ],
+                        "default": "",
+                        "beforeJS": "",
+                        "beforeJSWaitTime": 0,
+                        "JS": "",
+                        "JSWaitTime": 0,
+                        "afterJS": "",
+                        "afterJSWaitTime": 0,
+                        "downloadPic": 1
+                    }
+                ]
+            }
+        }
+    ]
+}

File diff ditekan karena terlalu besar
+ 0 - 0
ElectronJS/tasks/58.json


File diff ditekan karena terlalu besar
+ 0 - 0
ElectronJS/tasks/59.json


+ 2 - 1
ExecuteStage/.gitignore

@@ -11,4 +11,5 @@ Data/
 tasks/
 Application/
 .history
-execution_instances/
+execution_instances/
+.DS_Store

+ 1 - 1
ExecuteStage/.vscode/launch.json

@@ -12,7 +12,7 @@
             "console": "integratedTerminal",
             "justMyCode": true,
             // "args": ["--id", "38", "--read_type", "local", "--headless", "1"]
-            "args": ["--id", "10", "--headless", "0"]
+            "args": ["--id", "15", "--headless", "0"]
         }
     ]
 }

+ 166 - 146
ExecuteStage/easyspider_executestage.py

@@ -30,7 +30,7 @@ from selenium.webdriver.common.by import By
 from commandline_config import Config
 import pytesseract
 from PIL import Image
-
+import uuid
 
 saveName, log, OUTPUT, browser, SAVED = None, "", "", None, False
 
@@ -65,6 +65,38 @@ def Log(text, text2=""):
 # 屏幕滚动函数
 
 
+
+def download_image(url, save_directory):
+    # 定义浏览器头信息
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    
+    # 发送 GET 请求获取图片数据
+    response = requests.get(url, headers=headers)
+
+    # 检查响应状态码是否为成功状态
+    if response.status_code == requests.codes.ok:
+        # 提取文件名
+        file_name = url.split('/')[-1]
+        
+        # 生成唯一的新文件名
+        new_file_name = str(uuid.uuid4()) + '_' + file_name
+        
+        # 构建保存路径
+        save_path = os.path.join(save_directory, new_file_name)
+        
+        # 保存图片到本地
+        with open(save_path, 'wb') as file:
+            file.write(response.content)
+        
+        print("图片已成功下载到:", save_path)
+        print("The image has been successfully downloaded to:", save_path)
+    else:
+        print("下载图片失败,请检查此图片链接是否有效:", url)
+        print("Failed to download image, please check if this image link is valid:", url)
+
+
 def scrollDown(para, rt=""):
     try:
         if para["scrollType"] != 0 and para["scrollCount"] > 0:  # 控制屏幕向下滚动
@@ -180,6 +212,7 @@ def executeNode(nodeId, loopValue="", clickPath="", index=0):
         inputInfo(node["parameters"], loopValue)
     elif node["option"] == 5:  # 自定义操作
         customOperation(node, loopValue)
+        saveData()
     elif node["option"] == 8:  # 循环
         recordLog("loop")
         loopExcute(node, loopValue, clickPath, index)  # 执行循环
@@ -644,8 +677,8 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
                 recordLog('Element %s not found, use default' % p["relativeXPath"])
                 continue
             except TimeoutException:  # 超时的时候设置超时值
-                Log('time out after 10 seconds when getting data')
-                recordLog('time out after 10 seconds when getting data')
+                Log('time out after set seconds when getting data')
+                recordLog('time out after set seconds when getting data')
                 browser.execute_script('window.stop()')
                 if p["relative"]:  # 是否相对xpath
                     if p["relativeXPath"] == "":  # 相对xpath有时候就是元素本身,不需要二次查找
@@ -660,104 +693,44 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
             element = browser.find_element(By.XPATH, "//body")
         try:
             execute_code(2, p["beforeJS"], p["beforeJSWaitTime"], element) # 执行前置js
-            if p["contentType"] == 2:
-                content = element.get_attribute('innerHTML')
-            elif p["contentType"] == 3:
-                content = element.get_attribute('outerHTML')
-            elif p["contentType"] == 4:
-                # 获取元素的背景图片地址
-                bg_url = element.value_of_css_property('background-image')
-                # 清除背景图片地址中的多余字符
-                bg_url = bg_url.replace('url("', '').replace('")', '')
-                content = bg_url
-            elif p["contentType"] == 5:
-                content = browser.current_url
-            elif p["contentType"] == 6:
-                content = browser.title
-            elif p["contentType"] == 7:
-                # 获取整个网页的高度和宽度
-                height = browser.execute_script("return document.body.scrollHeight");
-                width = browser.execute_script("return document.body.scrollWidth");
-                # 调整浏览器窗口的大小
-                browser.set_window_size(width, height)
-                element.screenshot("Data/" +saveName + "/"+ str(time.time()) + ".png")
-            elif p["contentType"] == 8:
+            # 先处理特殊节点类型
+            if p["nodeType"] == 2:
+                if element.get_attribute("href") != None:
+                    content = element.get_attribute("href")
+                else:
+                    content = ""
+            elif p["nodeType"] == 3:
+                if element.get_attribute("value") != None:
+                    content = element.get_attribute("value")
+                else:
+                    content = ""
+            elif p["nodeType"] == 4:  # 图片
+                if element.get_attribute("src") != None:
+                    content = element.get_attribute("src")
+                else:
+                    content = ""
                 try:
-                    screenshot = element.screenshot_as_png
-                    screenshot_stream = io.BytesIO(screenshot)
-                    # 使用Pillow库打开截图,并转换为灰度图像
-                    image = Image.open(screenshot_stream).convert('L')
-                    # 使用Tesseract OCR引擎识别图像中的文本
-                    text = pytesseract.image_to_string(image,  lang='chi_sim+eng')
-                    content = text
-                except Exception as e:
-                    content = "OCR Error"
-                    print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
-                    print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中:https://blog.csdn.net/u010454030/article/details/80515501")
-            elif p["contentType"] == 9:
-                content = execute_code(2, p["JS"], p["JSWaitTime"], element)
-            elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
-                command = 'var arr = [];\
-                var content = arguments[0];\
-                for(var i = 0, len = content.childNodes.length; i < len; i++) {\
-                    if(content.childNodes[i].nodeType === 3){  \
-                        arr.push(content.childNodes[i].nodeValue);\
+                    downloadPic = p["downloadPic"]
+                except:
+                    downloadPic = 0
+                if downloadPic == 1:
+                    download_image(content, "Data/" +saveName + "/")
+            else: # 普通节点
+                if p["contentType"] == 0:
+                    content = element.text
+                elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
+                    command = 'var arr = [];\
+                    var content = arguments[0];\
+                    for(var i = 0, len = content.childNodes.length; i < len; i++) {\
+                        if(content.childNodes[i].nodeType === 3){  \
+                            arr.push(content.childNodes[i].nodeValue);\
+                        }\
                     }\
-                }\
-                var str = arr.join(" "); \
-                return str;'
-                content = browser.execute_script(command, element).replace(
-                    "\n", "").replace("\\s+", " ")
-                if p["nodeType"] == 2:
-                    if element.get_attribute("href") != None:
-                        content = element.get_attribute("href")
-                    else:
-                        content = ""
-                elif p["nodeType"] == 3:
-                    if element.get_attribute("value") != None:
-                        content = element.get_attribute("value")
-                    else:
-                        content = ""
-                elif p["nodeType"] == 4:  # 图片
-                    if element.get_attribute("src") != None:
-                        content = element.get_attribute("src")
-                    else:
-                        content = ""
-            elif p["contentType"] == 0:
-                content = element.text
-                if p["nodeType"] == 2:
-                    if element.get_attribute("href") != None:
-                        content = element.get_attribute("href")
-                    else:
-                        content = ""
-                elif p["nodeType"] == 3:
-                    if element.get_attribute("value") != None:
-                        content = element.get_attribute("value")
-                    else:
-                        content = ""
-                elif p["nodeType"] == 4:  # 图片
-                    if element.get_attribute("src") != None:
-                        content = element.get_attribute("src")
-                    else:
-                        content = ""
-        except StaleElementReferenceException:  # 发生找不到元素的异常后,等待几秒重新查找
-            recordLog('StaleElementReferenceException:'+p["relativeXPath"])
-            time.sleep(3)
-            try:
-                if p["relative"]:  # 是否相对xpath
-                    if p["relativeXPath"] == "":  # 相对xpath有时候就是元素本身,不需要二次查找
-                        element = loopElement
-                        recordLog('StaleElementReferenceException:loopElement')
-                    else:
-                        element = loopElement.find_element(By.XPATH,
-                                                           p["relativeXPath"][1:])
-                        recordLog(
-                            'StaleElementReferenceException:loopElement+relativeXPath')
-                else:
-                    element = browser.find_element(
-                        By.XPATH, p["relativeXPath"])
-                    recordLog('StaleElementReferenceException:relativeXPath')
-                if p["contentType"] == 2:
+                    var str = arr.join(" "); \
+                    return str;'
+                    content = browser.execute_script(command, element).replace(
+                        "\n", "").replace("\\s+", " ")
+                elif p["contentType"] == 2:
                     content = element.get_attribute('innerHTML')
                 elif p["contentType"] == 3:
                     content = element.get_attribute('outerHTML')
@@ -788,55 +761,101 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
                         text = pytesseract.image_to_string(image,  lang='chi_sim+eng')
                         content = text
                     except Exception as e:
-                        content = "OCR失败"
-                        print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable path: https://tesseract-ocr.github.io/tessdoc/Installation.html")
-                        print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量path中:")
+                        content = "OCR Error"
+                        print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
+                        print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中:https://blog.csdn.net/u010454030/article/details/80515501")
                 elif p["contentType"] == 9:
                     content = execute_code(2, p["JS"], p["JSWaitTime"], element)
-                elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
-                    command = 'var arr = [];\
-                    var content = arguments[0];\
-                    for(var i = 0, len = content.childNodes.length; i < len; i++) {\
-                        if(content.childNodes[i].nodeType === 3){  \
-                            arr.push(content.childNodes[i].nodeValue);\
+        except StaleElementReferenceException:  # 发生找不到元素的异常后,等待几秒重新查找
+            recordLog('StaleElementReferenceException:'+p["relativeXPath"])
+            time.sleep(3)
+            try:
+                if p["relative"]:  # 是否相对xpath
+                    if p["relativeXPath"] == "":  # 相对xpath有时候就是元素本身,不需要二次查找
+                        element = loopElement
+                        recordLog('StaleElementReferenceException:loopElement')
+                    else:
+                        element = loopElement.find_element(By.XPATH,
+                                                           p["relativeXPath"][1:])
+                        recordLog(
+                            'StaleElementReferenceException:loopElement+relativeXPath')
+                else:
+                    element = browser.find_element(
+                        By.XPATH, p["relativeXPath"])
+                    recordLog('StaleElementReferenceException:relativeXPath')
+                # 先处理特殊节点类型
+                if p["nodeType"] == 2:
+                    if element.get_attribute("href") != None:
+                        content = element.get_attribute("href")
+                    else:
+                        content = ""
+                elif p["nodeType"] == 3:
+                    if element.get_attribute("value") != None:
+                        content = element.get_attribute("value")
+                    else:
+                        content = ""
+                elif p["nodeType"] == 4:  # 图片
+                    if element.get_attribute("src") != None:
+                        content = element.get_attribute("src")
+                    else:
+                        content = ""
+                    try:
+                        downloadPic = p["downloadPic"]
+                    except:
+                        downloadPic = 0
+                    if downloadPic == 1:
+                        download_image(content, "Data/" +saveName + "/")
+                else: # 普通节点
+                    if p["contentType"] == 0:
+                        content = element.text
+                    elif p["contentType"] == 1:  # 只采集当期元素下的文本,不包括子元素
+                        command = 'var arr = [];\
+                        var content = arguments[0];\
+                        for(var i = 0, len = content.childNodes.length; i < len; i++) {\
+                            if(content.childNodes[i].nodeType === 3){  \
+                                arr.push(content.childNodes[i].nodeValue);\
+                            }\
                         }\
-                    }\
-                    var str = arr.join(" "); \
-                    return str;'
-                    content = browser.execute_script(command, element).replace(
-                        "\n", "").replace("\\s+", " ")
-                    if p["nodeType"] == 2:
-                        if element.get_attribute("href") != None:
-                            content = element.get_attribute("href")
-                        else:
-                            content = ""
-                    elif p["nodeType"] == 3:
-                        if element.get_attribute("value") != None:
-                            content = element.get_attribute("value")
-                        else:
-                            content = ""
-                    elif p["nodeType"] == 4:  # 图片
-                        if element.get_attribute("src") != None:
-                            content = element.get_attribute("src")
-                        else:
-                            content = ""
-                elif p["contentType"] == 0:
-                    content = element.text
-                    if p["nodeType"] == 2:
-                        if element.get_attribute("href") != None:
-                            content = element.get_attribute("href")
-                        else:
-                            content = ""
-                    elif p["nodeType"] == 3:
-                        if element.get_attribute("value") != None:
-                            content = element.get_attribute("value")
-                        else:
-                            content = ""
-                    elif p["nodeType"] == 4:  # 图片
-                        if element.get_attribute("src") != None:
-                            content = element.get_attribute("src")
-                        else:
-                            content = ""
+                        var str = arr.join(" "); \
+                        return str;'
+                        content = browser.execute_script(command, element).replace(
+                            "\n", "").replace("\\s+", " ")
+                    elif p["contentType"] == 2:
+                        content = element.get_attribute('innerHTML')
+                    elif p["contentType"] == 3:
+                        content = element.get_attribute('outerHTML')
+                    elif p["contentType"] == 4:
+                        # 获取元素的背景图片地址
+                        bg_url = element.value_of_css_property('background-image')
+                        # 清除背景图片地址中的多余字符
+                        bg_url = bg_url.replace('url("', '').replace('")', '')
+                        content = bg_url
+                    elif p["contentType"] == 5:
+                        content = browser.current_url
+                    elif p["contentType"] == 6:
+                        content = browser.title
+                    elif p["contentType"] == 7:
+                        # 获取整个网页的高度和宽度
+                        height = browser.execute_script("return document.body.scrollHeight");
+                        width = browser.execute_script("return document.body.scrollWidth");
+                        # 调整浏览器窗口的大小
+                        browser.set_window_size(width, height)
+                        element.screenshot("Data/" +saveName + "/"+ str(time.time()) + ".png")
+                    elif p["contentType"] == 8:
+                        try:
+                            screenshot = element.screenshot_as_png
+                            screenshot_stream = io.BytesIO(screenshot)
+                            # 使用Pillow库打开截图,并转换为灰度图像
+                            image = Image.open(screenshot_stream).convert('L')
+                            # 使用Tesseract OCR引擎识别图像中的文本
+                            text = pytesseract.image_to_string(image,  lang='chi_sim+eng')
+                            content = text
+                        except Exception as e:
+                            content = "OCR Error"
+                            print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
+                            print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中:https://blog.csdn.net/u010454030/article/details/80515501")
+                    elif p["contentType"] == 9:
+                        content = execute_code(2, p["JS"], p["JSWaitTime"], element)
             except StaleElementReferenceException:
                 recordLog('StaleElementReferenceException:'+p["relativeXPath"])
                 continue  # 再出现类似问题直接跳过
@@ -859,7 +878,7 @@ def isnull(s):
 
 def saveData(exit=False):
     global saveName, log, OUTPUT, browser
-    if exit == True or len(OUTPUT) > 100: # 每100条保存一次
+    if exit == True or len(OUTPUT) >= 100: # 每100条保存一次
         with open("Data/"+saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
             file_obj.write(log)
             file_obj.close()
@@ -890,6 +909,7 @@ if __name__ == '__main__':
         "config_folder": "",
         "config_file_name": "config.json",
         "headless": False,
+        "version": "0.3.0",
     }
     c = Config(config)
     print(c)

+ 2 - 0
Extension/manifest_v3/.gitignore

@@ -4,3 +4,5 @@ dist
 .env
 EasySpider_en
 EasySpider_zh
+EasySpider_en.crx
+EasySpider_zh.crx

TEMPAT SAMPAH
Extension/manifest_v3/EasySpider_en.crx


TEMPAT SAMPAH
Extension/manifest_v3/EasySpider_zh.crx


+ 1 - 1
Extension/manifest_v3/src/manifest.json

@@ -1,6 +1,6 @@
 {
   "name": "EasySpider",
-  "version": "0.2",
+  "version": "0.3.0",
   "description": "EasySpider's chrome extension",
   "author": "Naibo Wang",
   "manifest_version": 3,

+ 48 - 0
Releases/EasySpider_windows_amd64/V0.3.0 New Features.txt

@@ -0,0 +1,48 @@
+https://github.com/NaiboWang/EasySpider/releases/tag/v0.3.0
+
+### 强烈建议大家观看新特性讲解视频
+
+B站最新版特性视频已上传,新视频非常有用,推荐大家观看。
+
+[【重要】自定义条件判断之使用循环项内的JS命令返回值 - 第二弹](https://www.bilibili.com/video/BV1mu411x7Nn/)
+
+[如何执行自己写的JS代码和系统代码 (自定义操作)](https://www.bilibili.com/video/BV1qs4y1z7Hc/)
+
+[如何自定义循环和判断条件 - 第一弹](https://www.bilibili.com/video/BV1Ys4y1z777/)
+
+[如何对元素和网页截图及(无头模式)命令行执行指南](https://www.bilibili.com/video/BV1dV4y1z764/)
+
+[OCR识别元素内容功能](https://www.bilibili.com/video/BV1xz4y1b72D/)
+
+注意,v0.3.0版本任务task文件夹内`.json`文件和v0.2.0版本不兼容,请重新设计v0.3.0版本任务。
+
+## 更新说明
+1. 高级操作:
+ - 可以在任务流程中**执行自定义脚本**,包括在浏览器中**执行Javascript指令**以及**操作系统级别的脚本调用**并可**得到命令返回值并记录**,大大扩展了可操作空间。
+
+![image](https://github.com/NaiboWang/EasySpider/assets/30287768/06e63a06-328d-4339-b40b-2d57c94cee66)
+
+ - 在每一个操作执行前和执行后,都可以指定执行一段针对当前定位元素的JavaScript指令。
+ 
+<img src="https://github.com/NaiboWang/EasySpider/assets/30287768/dde64388-5668-40ff-951e-fb8f60655c49" height=50% width=50%> 
+
+2. **判断条件和循环条件**中同样增加了**执行自定义脚本**,并根据自定义脚本的返回值是否为真来作为条件判断和循环的判断条件,同样极大地增加了任务的可操作性。
+![image](https://github.com/NaiboWang/EasySpider/assets/30287768/9dea0564-1a1c-487d-9fa4-427c5e284796)
+3. 可同时生成多种XPath供用户选择,并**预装了XPath Helper扩展**供大家调试XPath。
+4. 增加采集元素背景图片地址,当前页面标题,当前页面URL地址功能。
+5. 增加保存元素截图功能,如要截图某元素或整个网页页面,可以用此功能(配合无头模式效果更好)。
+6. 增加下载图片功能(正式版,Beta版没有)。
+7. 增加OCR识别元素功能(使用此功能需首先自行安装Tesseract库:[https://blog.csdn.net/u010454030/article/details/80515501](https://blog.csdn.net/u010454030/article/details/80515501))
+8. 可直接提取对元素执行JavaScript代码后的返回值,实现如正则表达式,获得元素背景颜色等功能。
+<img src="https://github.com/NaiboWang/EasySpider/assets/30287768/f6a9b5ce-63c5-4348-8967-053c21d67ef9" width=50% height=50%>
+
+9. 大幅增加使用提示和说明,使软件更易用(如增加了iframe标签的处理方式说明,各个选项的参数意义,以及循环项XPath的修改说明等等)。
+10. 执行命令时增加了如何用命令行执行任务的提示:[https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction](https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction)。
+![image](https://github.com/NaiboWang/EasySpider/assets/30287768/a9e774df-e345-4d51-b7c9-2c4dac0ec624)
+11. 增加无头模式,即无浏览器界面模式配置。
+12. 修复了使用用户配置浏览器模式下的中文路径不能正确识别的问题。
+13. 修复了条件分支没有无条件分支时会卡死的问题。
+14. 修复了保存任务后输入框会卡死的问题。
+15. 打开网页操作和点击元素操作新增设置页面最长加载等待时间。
+16. 增加版本更新提示。
+17. 更新chrome版本为113。

+ 1 - 1
Releases/EasySpider_windows_amd64/config.json

@@ -1 +1 @@
-{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\Releases\\EasySpider_windows_amd64\\user_data1"}
+{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data12","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\Releases\\EasySpider_windows_amd64\\user_data1"}

File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/execution_instances/0.json


File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/execution_instances/1.json


File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/execution_instances/2.json


File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/execution_instances/3.json


File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/execution_instances/4.json


File diff ditekan karena terlalu besar
+ 0 - 0
Releases/EasySpider_windows_amd64/tasks/49.json


Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini