Browse Source

Iframe Nested

naibo 1 year ago
parent
commit
c3773848c3

File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/145.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/146.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/147.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/148.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/149.json


File diff suppressed because it is too large
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/238.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/230.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/231.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/232.json


File diff suppressed because it is too large
+ 0 - 0
ElectronJS/tasks/233.json


+ 2 - 2
ExecuteStage/.vscode/launch.json

@@ -12,8 +12,8 @@
             "justMyCode": false,
             //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
             // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            // "args": ["--ids", "[1]", "--headless", "0", "--user_data", "1", "--keyboard", "1"]
-            "args": "--ids '[3]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
+            "args": ["--ids", "[149]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
+            // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
         }
     ]
 }

+ 4 - 71
ExecuteStage/easyspider_executestage.py

@@ -2,6 +2,7 @@
 # import atexit
 import atexit
 import copy
+import platform
 import shutil
 import string
 import undetected_chromedriver as uc
@@ -1711,6 +1712,7 @@ class BrowserThread(Thread):
                         p["relativeXPath"], self.outputParameters, self)
                     # 只有当前环境不变变化才可以快速提取数据
                     if self.browser.iframe_env != p["iframe"]:
+                    # if p["iframe"] or self.browser.iframe_env != p["iframe"]: # 如果是iframe,则不能快速提取数据,主要是各个上下文的iframe切换,但一般不会有人这么做
                         p["optimizable"] = False
                         continue
                     # relativeXPath = relativeXPath.lower()
@@ -1820,7 +1822,7 @@ class BrowserThread(Thread):
                             element = self.browser.find_element(
                                 By.XPATH, relativeXPath, iframe=p["iframe"])
                     except (
-                    NoSuchElementException, InvalidSelectorException, StaleElementReferenceException):  # 找不到元素的时候,使用默认值
+                    NoSuchElementException, InvalidSelectorException, StaleElementReferenceException) as e:  # 找不到元素的时候,使用默认值
                         # self.print_and_log(p)
                         try:
                             content = p["default"]
@@ -1835,6 +1837,7 @@ class BrowserThread(Thread):
                                 self.print_and_log(
                                     "提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
                                         p["name"], relativeXPath))
+                                self.dataNotFoundKeys[p["name"]] = True
                         except:
                             pass
                         continue
@@ -1916,92 +1919,57 @@ if __name__ == '__main__':
     print(c)
     options = webdriver.ChromeOptions()
     driver_path = "chromedriver.exe"
-    import platform
-
     print(sys.platform, platform.architecture())
-    # option = webdriver.ChromeOptions()
     if not os.path.exists(os.getcwd() + "/Data"):
         os.mkdir(os.getcwd() + "/Data")
     if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
         options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
-        # MacOS需要用option而不是options!
-        # option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
-        # option.add_extension(
-            # "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
         options.add_extension(
             "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
         driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
-        # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
-        # # MacOS需要用option而不是options!
-        # option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
-        # driver_path = os.getcwd()+ "/chromedriver_mac64"
         print(driver_path)
         if c.config_folder == "":
             c.config_folder = os.path.expanduser(
                 "~/Library/Application Support/EasySpider/")
-        # print("Config folder for MacOS:", c.config_folder)
     elif os.path.exists(os.getcwd() + "/EasySpider/resources"):  # 打包后的路径
         print("Finding chromedriver in EasySpider",
               os.getcwd() + "/EasySpider")
         if sys.platform == "win32" and platform.architecture()[0] == "32bit":
             options.binary_location = os.path.join(
                 os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe")  # 指定chrome位置
-            # option.binary_location = os.path.join(
-            #     os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe")  # 指定chrome位置
             driver_path = os.path.join(
                 os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
-            # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
             options.add_extension("EasySpider/resources/app/XPathHelper.crx")
         elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
             options.binary_location = os.path.join(
                 os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
-            # option.binary_location = os.path.join(
-                # os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
             driver_path = os.path.join(
                 os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
-            # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
             options.add_extension("EasySpider/resources/app/XPathHelper.crx")
         elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
             options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
-            # option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
             driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
-            # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
             options.add_extension("EasySpider/resources/app/XPathHelper.crx")
         else:
             print("Unsupported platform")
             sys.exit()
         print("Chrome location:", options.binary_location)
         print("Chromedriver location:", driver_path)
-    # elif os.getcwd().find("ExecuteStage") >= 0:  # 如果直接执行
-    #     print("Finding chromedriver in ./Chrome",
-    #           os.getcwd()+"/Chrome")
-    #     options.binary_location = "./Chrome/chrome.exe"  # 指定chrome位置
-    #     # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
-    #     driver_path = "./Chrome/chromedriver.exe"
     elif os.path.exists(os.getcwd() + "/../ElectronJS"):
         # 软件dev用
         print("Finding chromedriver in EasySpider",
               os.getcwd() + "/ElectronJS")
-        # option.binary_location = "../ElectronJS/chrome_win64/chrome.exe"  # 指定chrome位置
         options.binary_location = "../ElectronJS/chrome_win64/chrome.exe"  # 指定chrome位置
         driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
-        # option.add_extension("../ElectronJS/XPathHelper.crx")
         options.add_extension("../ElectronJS/XPathHelper.crx")
     else:
         options.binary_location = "./chrome.exe"  # 指定chrome位置
-        # option.binary_location = "./chrome.exe"  # 指定chrome位置
         driver_path = "./chromedriver.exe"
-        # option.add_extension("XPathHelper.crx")
         options.add_extension("XPathHelper.crx")
 
-    # option.add_experimental_option(
-        # 'excludeSwitches', ['enable-automation'])  # 以开发者模式
     options.add_experimental_option(
         'excludeSwitches', ['enable-automation'])  # 以开发者模式
 
-    # user_data_dir = r''  # 注意没有Default!
-
-    # options.add_argument('--user-data-dir='+p)
 
     # 总结:
     # 0. 带Cookie需要用userdatadir
@@ -2018,22 +1986,15 @@ if __name__ == '__main__':
     except:
         pass
 
-    # options.add_argument(
-    #     '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data')  # TMALL 反扒
-    # option.add_argument(
-        # "--disable-blink-features=AutomationControlled")  # TMALL 反扒
     options.add_argument(
         "--disable-blink-features=AutomationControlled")  # TMALL 反扒
 
     options.add_argument('-ignore-certificate-errors')
     options.add_argument('-ignore -ssl-errors')
-    # option.add_argument('-ignore-certificate-errors')
-    # option.add_argument('-ignore -ssl-errors')
 
     if c.headless:
         print("Headless mode")
         print("无头模式")
-        # option.add_argument("--headless")
         options.add_argument("--headless")
 
     tmp_options = []
@@ -2058,11 +2019,7 @@ if __name__ == '__main__':
             shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
             print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
             print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
-            # option = tmp_options[i]["option"]
             options = tmp_options[i]["options"]
-            # option.add_argument(
-                # f'--user-data-dir={tmp_user_data_folder}')  # TMALL 反扒
-            # option.add_argument("--profile-directory=Default")
             options.add_argument(
                 f'--user-data-dir={tmp_user_data_folder}')  # TMALL 反扒
             options.add_argument("--profile-directory=Default")
@@ -2074,7 +2031,6 @@ if __name__ == '__main__':
     threads = []
     for i in range(len(c.ids)):
         id = c.ids[i]
-        # option = tmp_options[i]["option"]
         options = tmp_options[i]["options"]
         print("id: ", id)
         if c.read_type == "remote":
@@ -2100,7 +2056,6 @@ if __name__ == '__main__':
             cloudflare = 0
         if cloudflare == 0:
             options.add_argument('log-level=3')  # 隐藏日志
-            # option.add_argument('log-level=3')  # 隐藏日志
             path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
             print("Data path:", path)
             options.add_experimental_option("prefs", {
@@ -2116,37 +2071,17 @@ if __name__ == '__main__':
                 'safebrowsing.disable_download_protection': True,
                 'profile.default_content_settings.popups': 0,
             })
-            # option.add_experimental_option("prefs", {
-            #     # 设置文件下载路径
-            #     "download.default_directory": path,
-            #     "download.prompt_for_download": False,  # 禁止下载提示框
-            #     "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
-            #     "download.directory_upgrade": True,
-            #     "download.extensions_to_open": "applications/pdf",
-            #     "plugins.always_open_pdf_externally": True,  # 总是在外部程序中打开PDF
-            #     "safebrowsing_for_trusted_sources_enabled": False,
-            #     "safebrowsing.enabled": False,
-            #     'safebrowsing.enabled': False,
-            #     'safebrowsing.disable_download_protection': True,
-            #     'profile.default_content_settings.popups': 0,
-            # })
             try:
                 if service["environment"] == 1:
-                    # option.add_experimental_option(
-                        # 'mobileEmulation', {'deviceName': 'iPhone X'})  # 模拟iPhone X浏览
                     options.add_experimental_option(
                         'mobileEmulation', {'deviceName': 'iPhone X'})  # 模拟iPhone X浏览
             except:
                 pass
-            # browser_t = MyChrome(
-                # options=options, chrome_options=option, executable_path=driver_path)
             selenium_service = Service(executable_path=driver_path)
             browser_t = MyChrome(service=selenium_service, options=options)
         elif cloudflare == 1:
             if sys.platform == "win32":
                 options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe"  # 需要用自己的浏览器
-                # options.add_argument("--auto-open-devtools-for-tabs")
-                # options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"  # 需要用自己的浏览器
                 browser_t = MyUCChrome(
                     options=options, driver_executable_path=driver_path)
                 links = list(filter(isnotnull, service["links"].split("\n")))
@@ -2200,8 +2135,6 @@ if __name__ == '__main__':
         # print("您的操作系统不支持暂停功能。")
         # print("Your operating system does not support the pause function.")
 
-    # print("线程长度:", len(threads) )
-
     for thread in threads:
         print()
         thread.join()

+ 1 - 1
ExecuteStage/generateEXE_win64.cmd

@@ -1,6 +1,6 @@
 rmdir /s /q build
 rmdir /s /q dist
 @REM pyinstaller -F --icon=favicon.ico easyspider_executestage.py
-pyinstaller -F --icon=favicon.ico --add-data "C:\Python311\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Python311\Lib\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
+pyinstaller -F --icon=favicon.ico --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
 del ..\ElectronJS\chrome_win64\easyspider_executestage.exe
 copy dist\easyspider_executestage.exe ..\ElectronJS\chrome_win64\easyspider_executestage.exe

+ 136 - 61
ExecuteStage/myChrome.py

@@ -25,75 +25,150 @@ class MyChrome(webdriver.Chrome):
         self.iframe_env = False  # 现在的环境是root还是iframe
         super().__init__(*args, **kwargs)  # 调用父类的 __init__
 
-    def find_element(self, by=By.ID, value=None, iframe=False):
-        # 在这里改变查找元素的行为
-        if self.iframe_env:
-            super().switch_to.default_content()
-            self.iframe_env = False
-        if iframe:
-            # 获取所有的 iframe
+    # def find_element(self, by=By.ID, value=None, iframe=False):
+    #     # 在这里改变查找元素的行为
+    #     if self.iframe_env:
+    #         super().switch_to.default_content()
+    #         self.iframe_env = False
+    #     if iframe:
+    #         # 获取所有的 iframe
+    #         try:
+    #             iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
+    #         except Exception as e:
+    #             print(e)
+    #         find_element = False
+    #         # 遍历所有的 iframe 并查找里面的元素
+    #         for iframe in iframes:
+    #             # 切换到 iframe
+    #             super().switch_to.default_content()
+    #             super().switch_to.frame(iframe)
+    #             self.iframe_env = True
+    #             try:
+    #                 # 在 iframe 中查找元素
+    #                 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
+    #                 element = super().find_element(by=by, value=value)
+    #                 find_element = True
+    #             except NoSuchElementException as e:
+    #                 print(f"No such element found in the iframe: {str(e)}")
+    #             except Exception as e:
+    #                 print(f"Exception: {str(e)}")
+    #             # 完成操作后切回主文档
+    #             # super().switch_to.default_content()
+    #             if find_element:
+    #                 return element
+    #         if not find_element:
+    #             raise NoSuchElementException
+    #     else:
+    #         return super().find_element(by=by, value=value)
+
+    def find_element_recursive(self, by, value, frames):
+        for frame in frames:
             try:
-                iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
-            except Exception as e:
-                print(e)
-            find_element = False
-            # 遍历所有的 iframe 并查找里面的元素
-            for iframe in iframes:
-                # 切换到 iframe
-                super().switch_to.default_content()
-                super().switch_to.frame(iframe)
-                self.iframe_env = True
                 try:
-                    # 在 iframe 中查找元素
-                    # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
-                    element = super().find_element(by=by, value=value)
-                    find_element = True
-                except NoSuchElementException as e:
-                    print(f"No such element found in the iframe: {str(e)}")
-                except Exception as e:
-                    print(f"Exception: {str(e)}")
-                # 完成操作后切回主文档
-                # super().switch_to.default_content()
-                if find_element:
+                    self.switch_to.frame(frame)
+                except StaleElementReferenceException:
+                    # If the frame reference is stale (e.g. the frame was refreshed), switch to the parent frame first and then re-enter the frame.
+                    self.switch_to.parent_frame()
+                    self.switch_to.frame(frame)
+                try:
+                    # NOTE: search only inside the current frame. self.find_element would first switch back to the default content, so we call super().find_element to stay in this frame.
+                    element = super(MyChrome, self).find_element(by=by, value=value)
                     return element
-            if not find_element:
-                raise NoSuchElementException
-        else:
-            return super().find_element(by=by, value=value)
+                except NoSuchElementException:
+                    # Recurse into nested iframes
+                    nested_frames = super(MyChrome, self).find_elements(By.CSS_SELECTOR, "iframe")
+                    if nested_frames:
+                        element = self.find_element_recursive(by, value, nested_frames)
+                        if element:
+                            return element
+            except Exception as e:
+                print(f"Exception while processing frame: {e}")
 
-    def find_elements(self, by=By.ID, value=None, iframe=False):
-        # 在这里改变查找元素的行为
-        if self.iframe_env:
-            super().switch_to.default_content()
-            self.iframe_env = False
+        raise NoSuchElementException(f"Element {value} not found in any frame or iframe")
+
+    def find_element(self, by=By.ID, value=None, iframe=False):
+        self.switch_to.default_content()  # Switch back to the main document
+        self.iframe_env = False
         if iframe:
-            # 获取所有的 iframe
-            iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
-            find_element = False
-            # 遍历所有的 iframe 并找到里面的元素
-            for iframe in iframes:
-                # 切换到 iframe
+            frames = self.find_elements(By.CSS_SELECTOR, "iframe")
+            if not frames:
+                raise NoSuchElementException(f"No iframes found in the current page while searching for {value}")
+            self.iframe_env = True
+            return self.find_element_recursive(by, value, frames)
+        else:
+            # Find element in the main document as normal
+            return super(MyChrome, self).find_element(by=by, value=value)
+
+    # def find_elements(self, by=By.ID, value=None, iframe=False):
+    #     # 在这里改变查找元素的行为
+    #     if self.iframe_env:
+    #         super().switch_to.default_content()
+    #         self.iframe_env = False
+    #     if iframe:
+    #         # 获取所有的 iframe
+    #         iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
+    #         find_element = False
+    #         # 遍历所有的 iframe 并找到里面的元素
+    #         for iframe in iframes:
+    #             # 切换到 iframe
+    #             try:
+    #                 super().switch_to.default_content()
+    #                 super().switch_to.frame(iframe)
+    #                 self.iframe_env = True
+    #                 # 在 iframe 中查找元素
+    #                 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
+    #                 elements = super().find_elements(by=by, value=value)
+    #                 if len(elements) > 0:
+    #                     find_element = True
+    #                 # 完成操作后切回主文档
+    #                 # super().switch_to.default_content()
+    #                 if find_element:
+    #                     return elements
+    #             except NoSuchElementException as e:
+    #                 print(f"No such element found in the iframe: {str(e)}")
+    #             except Exception as e:
+    #                 print(f"Exception: {str(e)}")
+    #         if not find_element:
+    #             raise NoSuchElementException
+    #     else:
+    #         return super().find_elements(by=by, value=value)
+
+    def find_elements_recursive(self, by, value, frames):
+        for frame in frames:
+            try:
                 try:
-                    super().switch_to.default_content()
-                    super().switch_to.frame(iframe)
-                    self.iframe_env = True
-                    # 在 iframe 中查找元素
-                    # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
-                    elements = super().find_elements(by=by, value=value)
-                    if len(elements) > 0:
-                        find_element = True
-                    # 完成操作后切回主文档
-                    # super().switch_to.default_content()
-                    if find_element:
+                    self.switch_to.frame(frame)
+                except StaleElementReferenceException:
+                    # If the frame reference is stale (e.g. the frame was refreshed), switch to the parent frame first and then re-enter the frame.
+                    self.switch_to.parent_frame()
+                    self.switch_to.frame(frame)
+                # Directly find elements in the current frame
+                elements = super(MyChrome, self).find_elements(by=by, value=value)
+                if elements:
+                    return elements
+                # Recursively search for elements in nested iframes
+                nested_frames = super(MyChrome, self).find_elements(By.CSS_SELECTOR, "iframe")
+                if nested_frames:
+                    elements = self.find_elements_recursive(by, value, nested_frames)
+                    if elements:
                         return elements
-                except NoSuchElementException as e:
-                    print(f"No such element found in the iframe: {str(e)}")
-                except Exception as e:
-                    print(f"Exception: {str(e)}")
-            if not find_element:
-                raise NoSuchElementException
+            except Exception as e:
+                print(f"Exception while processing frame: {e}")
+
+        raise NoSuchElementException(f"Elements with {value} not found in any frame or iframe")
+
+    def find_elements(self, by=By.ID, value=None, iframe=False):
+        self.switch_to.default_content()  # Switch back to the main document
+        self.iframe_env = False
+        if iframe:
+            frames = self.find_elements(By.CSS_SELECTOR, "iframe")
+            if not frames:
+                return []  # Return an empty list if no iframes are found
+            self.iframe_env = True
+            return self.find_elements_recursive(by, value, frames)
         else:
-            return super().find_elements(by=by, value=value)
+            # Find elements in the main document as normal
+            return super(MyChrome, self).find_elements(by=by, value=value)
 
 # MacOS不支持直接打包带Cloudflare的功能,如果要自己编译运行,可以把这个if去掉,然后配置好浏览器和driver路径
 if sys.platform != "darwin": 

+ 2 - 2
Extension/manifest_v3/package-lock.json

@@ -1,12 +1,12 @@
 {
     "name": "EasySpider",
-    "version": "0.5.0",
+    "version": "0.6.0",
     "lockfileVersion": 3,
     "requires": true,
     "packages": {
         "": {
             "name": "EasySpider",
-            "version": "0.5.0",
+            "version": "0.6.0",
             "license": "AGPL-3.0",
             "dependencies": {
                 "crx": "^5.0.1",

+ 1 - 1
Extension/manifest_v3/src/content-scripts/config.json

@@ -1 +1 @@
-{"language":"zh"}
+{"language":"en"}

Some files were not shown because too many files changed in this diff