Browse Source

New Version Pre-Release

naibo 2 năm trước
mục cha
commit
76b9b10dc7

+ 2 - 2
.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py

@@ -41,7 +41,7 @@ import pytesseract
 from PIL import Image
 # import uuid
 from threading import Thread, Event
-from myChrome import MyChrome
+from myChrome import MyChrome, MyUCChrome
 from utils import check_pause, download_image, get_output_code, isnull, myMySQL, new_line, write_to_csv, write_to_excel
 desired_capabilities = DesiredCapabilities.CHROME
 desired_capabilities["pageLoadStrategy"] = "none"
@@ -1473,7 +1473,7 @@ if __name__ == '__main__':
             browser_t = MyChrome(
                 options=options, chrome_options=option, executable_path=driver_path)
         elif cloudflare == 1:
-            browser_t = uc.Chrome(
+            browser_t = MyUCChrome(
                 options=options, chrome_options=option, executable_path=driver_path)
             print("Pass Cloudflare Mode")
             print("过Cloudflare验证模式")

+ 76 - 2
.temp_to_pub/EasySpider_windows_x64/Code/myChrome.py

@@ -1,5 +1,3 @@
-
-
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.action_chains import ActionChains
@@ -14,10 +12,12 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 from selenium.webdriver.support.ui import Select
 from selenium.webdriver import ActionChains
 from selenium.webdriver.common.by import By
+import undetected_chromedriver as uc
 desired_capabilities = DesiredCapabilities.CHROME
 desired_capabilities["pageLoadStrategy"] = "none"
 
 
+
 class MyChrome(webdriver.Chrome):
 
     def __init__(self, *args, **kwargs):
@@ -89,3 +89,77 @@ class MyChrome(webdriver.Chrome):
                 raise NoSuchElementException
         else:
             return super().find_elements(by=by, value=value)
+        
+
+class MyUCChrome(uc.Chrome):
+    """undetected_chromedriver Chrome whose element lookups can search inside iframes.
+
+    ``iframe_env`` records whether the driver was left focused on an iframe
+    (True) or on the root document (False) by the previous lookup.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self.iframe_env = False  # Current context: root document or an iframe
+        super().__init__(*args, **kwargs)  # Delegate construction to uc.Chrome
+
+    def find_element(self, by=By.ID, value=None, iframe=False):
+        """Return the first element matching ``by``/``value``.
+
+        With ``iframe=True`` every ``iframe`` element found from the current
+        context is searched in turn; focus stays on the iframe that matched.
+
+        Raises:
+            NoSuchElementException: ``iframe=True`` and no iframe has a match.
+        """
+        if self.iframe_env:
+            # A previous lookup left the driver inside an iframe; go back to root.
+            super().switch_to.default_content()
+            self.iframe_env = False
+        if not iframe:
+            return super().find_element(by=by, value=value)
+        # Collect the iframes to search.
+        try:
+            frames = super().find_elements(By.CSS_SELECTOR, "iframe")
+        except Exception as e:
+            print(e)
+            frames = []  # Was left unbound before, which raised a NameError below
+        for frame in frames:
+            # Return to the root first so each iframe is entered from a known context.
+            super().switch_to.default_content()
+            super().switch_to.frame(frame)
+            self.iframe_env = True
+            try:
+                return super().find_element(by=by, value=value)
+            except Exception:
+                print("No such element found in the iframe")
+        # Every iframe was searched without a match.
+        raise NoSuchElementException
+
+    def find_elements(self, by=By.ID, value=None, iframe=False):
+        """Return all elements matching ``by``/``value``.
+
+        With ``iframe=True`` the matches from the first iframe that has any are
+        returned, and the driver stays focused on that iframe.
+
+        Raises:
+            NoSuchElementException: ``iframe=True`` and no iframe has a match.
+        """
+        if self.iframe_env:
+            # A previous lookup left the driver inside an iframe; go back to root.
+            super().switch_to.default_content()
+            self.iframe_env = False
+        if not iframe:
+            return super().find_elements(by=by, value=value)
+        for frame in super().find_elements(By.CSS_SELECTOR, "iframe"):
+            try:
+                super().switch_to.default_content()
+                super().switch_to.frame(frame)
+                self.iframe_env = True
+                elements = super().find_elements(by=by, value=value)
+                # An empty result means "keep looking", not "found".
+                if elements:
+                    return elements
+            except Exception:
+                print("No such element found in the iframe")
+        raise NoSuchElementException
+

Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/0.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/1.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json


+ 0 - 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json

@@ -1 +0,0 @@
-{"id":6,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/8/2023, 7:54:10 AM","update_time":"7/8/2023, 7:54:10 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"text","exampleValue":"sadf<enter>","value":"sadf<enter>"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"sadf<enter>","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text defcolor']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}}]}

+ 0 - 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json

@@ -1 +0,0 @@
-{"id":7,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/8/2023, 7:54:10 AM","update_time":"7/8/2023, 7:54:46 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"text","exampleValue":"sadf<enter>","value":"sadf<enter>"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","iframe":false,"wait":5,"waitType":"1","beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"sadf<enter>","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text defcolor']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}}]}

Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/109.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/114.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/115.json


Những thay đổi đã bị ẩn vì quá lớn
+ 0 - 0
.temp_to_pub/EasySpider_windows_x64/tasks/34.json


+ 12 - 8
.temp_to_pub/compress.py

@@ -47,10 +47,12 @@ if __name__ == "__main__":
         file_name = f"EasySpider_{easyspider_version}_windows_x64.7z"
         if os.path.exists("./EasySpider_windows_x64/user_data"):
             shutil.rmtree("./EasySpider_windows_x64/user_data")
-        shutil.rmtree("./EasySpider_windows_x64/Data")
-        shutil.rmtree("./EasySpider_windows_x64/config.json")
-        shutil.rmtree("./EasySpider_windows_x64/mysql_config.json")
-        shutil.rmtree("./EasySpider_windows_x64/execution_instances")
+        if os.path.exists("./EasySpider_windows_x64/Data"):
+            shutil.rmtree("./EasySpider_windows_x64/Data")
+        if os.path.exists("./EasySpider_windows_x64/execution_instances"):
+            shutil.rmtree("./EasySpider_windows_x64/execution_instances")
+        os.remove("./EasySpider_windows_x64/config.json")
+        os.remove("./EasySpider_windows_x64/mysql_config.json")
         os.mkdir("./EasySpider_windows_x64/Data")
         os.mkdir("./EasySpider_windows_x64/execution_instances")
         compress_folder_to_7z_split("./EasySpider_windows_x64", file_name)
@@ -61,10 +63,12 @@ if __name__ == "__main__":
         file_name = f"EasySpider_{easyspider_version}_windows_x86.7z"
         if os.path.exists("./EasySpider_windows_x86/user_data"):
             shutil.rmtree("./EasySpider_windows_x86/user_data")
-        shutil.rmtree("./EasySpider_windows_x86/Data")
-        shutil.rmtree("./EasySpider_windows_x86/execution_instances")
-        shutil.rmtree("./EasySpider_windows_x86/config.json")
-        shutil.rmtree("./EasySpider_windows_x86/mysql_config.json")
+        if os.path.exists("./EasySpider_windows_x86/Data"):
+            shutil.rmtree("./EasySpider_windows_x86/Data")
+        if os.path.exists("./EasySpider_windows_x86/execution_instances"):
+            shutil.rmtree("./EasySpider_windows_x86/execution_instances")
+        os.remove("./EasySpider_windows_x86/config.json")
+        os.remove("./EasySpider_windows_x86/mysql_config.json")
         os.mkdir("./EasySpider_windows_x86/Data")
         os.mkdir("./EasySpider_windows_x86/execution_instances")
         compress_folder_to_7z("./EasySpider_windows_x64", file_name)

+ 2 - 2
ElectronJS/src/taskGrid/FlowChart_CN.html

@@ -551,7 +551,7 @@
                     <input onkeydown="inputDelete(event)" required name="serviceName" value="新web采集任务" id="serviceName" class="form-control"></input>
                     <label>任务描述:</label>
                     <input onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
-                    <label>导出数据格式(Excel/CSV/TXT/数据库):</label>
+                    <label>导出数据格式(Excel/CSV/TXT/数据库,<a href="https://www.bilibili.com/video/BV1os4y1679S/" target="_blank">查看MySQL操作教程</a>):</label>
                     <select id="outputFormat" class="form-control">
                         <option value = "xlsx">XLSX(EXCEL)</option>
                         <option value = "csv">CSV</option>
@@ -560,7 +560,7 @@
                     </select>
                     <label>导出文件名/数据库表格名称(名称中的“current_time”会被替换为执行任务时的时间戳):</label>
                     <input onkeydown="inputDelete(event)" value="current_time" id="saveName" class="form-control"></input>
-                    <label>是否为cloudflare等极端反爬网站:</label>
+                    <label>是否为Cloudflare等极端反爬网站(<a href="https://www.bilibili.com/video/BV1Ph4y1E7R9/" target="_blank">查看Cloudflare设计和执行教程</a>):</label>
                     <select id="cloudflare" name="cloudflare" class="form-control">
                         <option value = 0>否</option>
                         <option value = 1>是</option>

+ 4 - 0
Readme.md

@@ -74,6 +74,10 @@ Bilibili/B站视频教程:
 
 [实例 - 反人类网站文章采集和代码调试](https://www.bilibili.com/video/BV11W4y1D71t/)
 
+[写入MySQL数据库教程](https://www.bilibili.com/video/BV1os4y1679S/)
+
+[Cloudflare等极端反爬网站如何爬取](https://www.bilibili.com/video/BV1Ph4y1E7R9/)
+
 Refer to [Youtube Playlist](https://youtube.com/playlist?list=PL0kEFEkWrT7mt9MUlEBV2DTo1QsaanUTp) to see the video tutorials of EasySpider.
 
 ## 样例任务/Sample Tasks

Một số tệp không được hiển thị vì có quá nhiều tệp thay đổi trong bản so sánh này