Browse Source

修复循环内相对引用//XPATH的错误以及id()XPATH无法使用的错误

naibo 2 years ago
parent
commit
3230254f98

+ 1 - 1
ElectronJS/src/taskGrid/FlowChart.html

@@ -456,7 +456,7 @@
 
                     <div id="breakAdvanced" v-if='nowNode["parameters"]["loopType"] < 5'>
                         <div>
-                            <p><label>(Advanced Operation) Define loop exit condition using code/script:</label></p>
+                            <p><label>(Advanced Operation) Define loop exit condition using code/script; or you can add a <b>Custom Action</b>, then select the "Exit Loop" option:</label></p>
                             <select v-model='nowNode["parameters"]["breakMode"]' class="form-control" style="font-weight: bold">
                                 <option value=0>Do not set script (even if a script is written below, it will not be executed)</option>
                                 <option value=1>JavaScript script (start with 'return ')</option>

+ 2 - 2
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -451,12 +451,12 @@
                         <input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
                     </div>
                     <!-- 这里添加退出循环条件,找不到元素肯定退出循环 -->
-                    <label v-if='parseInt(loopType) == 0'>最多执行循环次数(0代表无限循环直到找不到元素或数据变化为止):</label>
+                    <label v-if='parseInt(loopType) == 0'>最多执行循环次数(0代表无限循环直到找不到元素或检测不到页面内容变化为止):</label>
                     <input onkeydown="inputDelete(event)" required v-if='parseInt(loopType) == 0' class="form-control" type="number" v-model.number='nowNode["parameters"]["exitCount"]'></input>
 
                     <div id="breakAdvanced" v-if='nowNode["parameters"]["loopType"] < 5'>
                         <div>
-                            <p><label>(高级操作)使用代码/脚本定义循环退出条件(也可以在流程中添加自定义操作,然后选择Break选项): </label></p>
+                            <p><label>(高级操作)使用代码/脚本定义循环退出条件(也可以在流程中添加<b>自定义操作</b>,然后选择<b>退出循环</b>选项): </label></p>
                             <select v-model='nowNode["parameters"]["breakMode"]' class="form-control" style="font-weight: bold">
                                 <option value = 0>不设置脚本(选择这个下面写了脚本也不会执行)</option>
                                 <option value = 1>JavaScript脚本返回值(需以return 开头)</option>

File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/158.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/159.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/162.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/163.json


+ 1 - 0
ElectronJS/tasks/164.json

@@ -0,0 +1 @@
+{"id":164,"name":"Just a moment...","url":"https://turnstile.zeroclover.io/","links":"https://turnstile.zeroclover.io/","create_time":"","update_time":"7/12/2023, 5:36:24 AM","version":"0.3.5","saveThreshold":10,"cloudflare":1,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://turnstile.zeroclover.io/","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://turnstile.zeroclover.io/","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://turnstile.zeroclover.io/"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://turnstile.zeroclover.io/","links":"https://turnstile.zeroclover.io/","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"feedback-form\")]/input[2]","iframe":false,"wait":20,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":["/html/body/div[1]/form[1]/input[2]","//input[contains(., '')]","/html/body/div[last()-3]/form/input"]}}]}

File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/165.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/4.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/49.json


+ 1 - 1
ExecuteStage/.vscode/launch.json

@@ -12,7 +12,7 @@
             "justMyCode": false,
             //  "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
             // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--id", "[90]", "--headless", "0", "--user_data", "1"]
+            "args": ["--id", "[3]", "--headless", "0", "--user_data", "1"]
         }
     ]
 }

+ 33 - 18
ExecuteStage/easyspider_executestage.py

@@ -1221,29 +1221,42 @@ class BrowserThread(Thread):
                     # p["relativeXPath"] = p["relativeXPath"].lower()
                     # p["relativeXPath"] = lowercase_tags_in_xpath(p["relativeXPath"])
                     # 已经有text()或@href了,不需要再加
+                    content_type = ""
                     if p["relativeXPath"].find("/@href") >= 0 or p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
-                        xpath = p["relativeXPath"]
+                        content_type = ""
                     elif p["nodeType"] == 2:
-                        xpath = p["relativeXPath"] + "/@href"
+                        content_type = "/@href"
                     elif p["contentType"] == 1:
-                        xpath = p["relativeXPath"] + "/text()"
+                        content_type = "/text()"
                     elif p["contentType"] == 0:
-                        xpath = p["relativeXPath"] + "//text()"
+                        content_type = "//text()"
+                    xpath = p["relativeXPath"] + content_type
                     if p["relative"]:
                         # if p["relativeXPath"] == "":
                         #     content = [loopElementHTML]
                         # else:
                         # 如果字串里有//即子孙查找,则不动语句
                         if p["relativeXPath"].find("//") >= 0:
-                            full_path = "(" + parentPath + \
-                                xpath + ")" + \
-                                "[" + str(index + 1) + "]"
-                            content = pageHTML.xpath(full_path)
+                            if xpath.startswith("/"): 
+                                full_path = "(" + parentPath  + ")" + \
+                                        "[" + str(index + 1) + "]"+ \
+                                        p["relativeXPath"] + content_type
+                            else: # 如果是id()这种形式,不需要包parentPath
+                                full_path = xpath
+                            try:
+                                content = pageHTML.xpath(full_path)
+                            except:
+                                content = []
+                        elif not p["relativeXPath"].startswith("/"): # 如果是id()这种形式,不需要包/html/body
+                            try:
+                                content = loopElementHTML.xpath(xpath)
+                            except:
+                                content = []
                         else:
                             content = loopElementHTML.xpath(
                                 "/html/body/" + loopElementHTML[0][0].tag + xpath)
                     else:
-                        if xpath.find("/body") < 0:
+                        if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body
                             xpath = "/html/body" + xpath
                         content = pageHTML.xpath(xpath)
                     if len(content) > 0:
@@ -1289,9 +1302,12 @@ class BrowserThread(Thread):
                             else:
                                 # 如果字串里有//即子孙查找,则不动语句
                                 if p["relativeXPath"].find("//") >= 0:
-                                    full_path = "(" + parentPath + \
-                                        p["relativeXPath"] + ")" + \
-                                        "[" + str(index + 1) + "]"
+                                    # full_path = "(" + parentPath + \
+                                    #     p["relativeXPath"] + ")" + \
+                                    #     "[" + str(index + 1) + "]"
+                                    full_path = "(" + parentPath + ")" + \
+                                        "[" + str(index + 1) + "]" + \
+                                        p["relativeXPath"]
                                     element = self.browser.find_element(
                                         By.XPATH, full_path, iframe=p["iframe"])
                                 else:
@@ -1462,10 +1478,8 @@ if __name__ == '__main__':
 
     option.add_experimental_option(
         'excludeSwitches', ['enable-automation'])  # 以开发者模式
-    options.add_argument('-ignore-certificate-errors')
-    options.add_argument('-ignore -ssl-errors')
-    option.add_argument('-ignore-certificate-errors')
-    option.add_argument('-ignore -ssl-errors')
+    options.add_argument('log-level=3')  # 隐藏日志
+    option.add_argument('log-level=3')  # 隐藏日志
     # user_data_dir = r''  # 注意没有Default!
 
     # options.add_argument('--user-data-dir='+p)
@@ -1559,8 +1573,6 @@ if __name__ == '__main__':
             if sys.platform != "darwin":
                 browser_t = MyUCChrome(
                 options=options, chrome_options=option, driver_executable_path=driver_path)
-                print("Pass Cloudflare Mode")
-                print("过Cloudflare验证模式")
             else:
                 print("Not support Cloudflare Mode on MacOS")
                 print("MacOS不支持Cloudflare验证模式")
@@ -1587,6 +1599,9 @@ if __name__ == '__main__':
         print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
         print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
         print("----------------------------------\n\n")
+        if cloudflare:
+            print("过Cloudflare验证模式有时候会不稳定,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
+            print("Passing Cloudflare verification mode is sometimes unstable, if you cannot pass the verification, you need to try again every few minutes, or you can change a new user information folder and then execute the task.")
         # 使用监听器监听键盘输入
         try:
             with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:

+ 1 - 1
ExecuteStage/undetected_chromedriver_ES/__init__.py

@@ -451,7 +451,7 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
             service = None
 
         super(Chrome, self).__init__(
-            executable_path=driver_executable_path,
+            executable_path=self.patcher.executable_path,
             port=port,
             options=options,
             service_args=service_args,

+ 12 - 6
ExecuteStage/undetected_chromedriver_ES/patcher.py

@@ -116,18 +116,24 @@ class Patcher(object):
         #     # -1 being a skip value used later in this block
         #
         p = pathlib.Path(self.data_path)
-        with Lock():
-            files = list(p.rglob("*chromedriver*?"))
-            for file in files:
-                if self.is_binary_patched(file):
-                    self.executable_path = str(file)
-                    return True
+        # with Lock():
+        #     files = list(p.rglob("*chromedriver*?"))
+        #     for file in files:
+        #         if self.is_binary_patched(file):
+        #             self.executable_path = str(file)
+        #             return True
 
         if executable_path:
             self.executable_path = executable_path
             self._custom_exe_path = True
 
         if self._custom_exe_path:
+            file_name, file_extension = os.path.splitext(self.executable_path)
+            # 创建新的文件名
+            new_file = f"{file_name}_uc{file_extension}"
+            if not os.path.exists(new_file):
+                shutil.copy(self.executable_path, new_file)
+            self.executable_path = new_file # 用新的chromedriver
             ispatched = self.is_binary_patched(self.executable_path)
             if not ispatched:
                 return self.patch_exe()

+ 11 - 0
Readme.md

@@ -32,6 +32,12 @@ A visual code-free/no-code web crawler/spider, just select the content you want
 
 ![animation_en](media/animation_en.gif)
 
+### 更多特性/More Features
+
+更多特性请翻到页面底部查看。
+
+For more features, please scroll to the bottom of this page.
+
 ## 下载易采集/Download EasySpider
 
 进入 [Releases Page](https://github.com/NaiboWang/EasySpider/releases) 下载最新版本。如果下载速度慢,可以考虑中国境内下载地址:[中国境内下载地址](https://www.easyspider.cn/download.html)。
@@ -144,6 +150,11 @@ At the same time, the software is protected by patent rights. If you want to use
 
 Refer to [Compilation Instructions](ElectronJS/README.md).
 
+## 支持特性/Supported Features
+
+![pic](media/features_CN.png)
+![pic](media/features_EN.png)
+
 ## 中文界面截图
 
 #### 软件界面示例

BIN
media/features_CN.png


BIN
media/features_EN.png


Some files were not shown because too many files changed in this diff