|
@@ -266,18 +266,40 @@ class BrowserThread(Thread):
|
|
|
scrollType = int(para["scrollType"])
|
|
|
try:
|
|
|
if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
|
|
|
- for i in range(para["scrollCount"]):
|
|
|
- self.Log("Wait for set second after screen scrolling")
|
|
|
- body = self.browser.find_element(
|
|
|
- By.CSS_SELECTOR, "body", iframe=para["iframe"])
|
|
|
- if scrollType == 1:
|
|
|
- body.send_keys(Keys.PAGE_DOWN)
|
|
|
- elif scrollType == 2:
|
|
|
+ if scrollType == 1 or scrollType == 2:
|
|
|
+ for i in range(para["scrollCount"]):
|
|
|
+ self.Log("Wait for set second after screen scrolling")
|
|
|
+ body = self.browser.find_element(
|
|
|
+ By.CSS_SELECTOR, "body", iframe=para["iframe"])
|
|
|
+ if scrollType == 1:
|
|
|
+ body.send_keys(Keys.PAGE_DOWN)
|
|
|
+ elif scrollType == 2:
|
|
|
+ body.send_keys(Keys.END)
|
|
|
+ try:
|
|
|
+ time.sleep(para["scrollWaitTime"]) # 下拉完等待
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+ elif scrollType == 3:
|
|
|
+ bodyText = ""
|
|
|
+ i = 0
|
|
|
+ while True:
|
|
|
+ newBodyText = self.browser.page_source
|
|
|
+ if newBodyText == bodyText:
|
|
|
+ print("页面已检测不到新内容,停止滚动。")
|
|
|
+ print("No new content detected on the page, stop scrolling.")
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ bodyText = newBodyText
|
|
|
+ body = self.browser.find_element(
|
|
|
+ By.CSS_SELECTOR, "body", iframe=para["iframe"])
|
|
|
body.send_keys(Keys.END)
|
|
|
- try:
|
|
|
- time.sleep(para["scrollWaitTime"]) # 下拉完等待
|
|
|
- except:
|
|
|
- pass
|
|
|
+ print("滚动到底部,第", i + 1, "次。")
|
|
|
+ print("Scroll to the bottom, the", i + 1, "time.")
|
|
|
+ i = i + 1
|
|
|
+ try:
|
|
|
+ time.sleep(para["scrollWaitTime"]) # 下拉完等待
|
|
|
+ except:
|
|
|
+ pass
|
|
|
except:
|
|
|
self.Log('Time out after set seconds when scrolling. ')
|
|
|
self.recordLog('Time out after set seconds when scrolling')
|
|
@@ -589,9 +611,18 @@ class BrowserThread(Thread):
|
|
|
if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
|
|
|
# 无跳转标签页操作
|
|
|
count = 0 # 执行次数
|
|
|
+ bodyText = "-"
|
|
|
while True: # do while循环
|
|
|
try:
|
|
|
finished = False
|
|
|
+ newBodyText = self.browser.page_source
|
|
|
+ if newBodyText == bodyText: # 如果页面内容无变化
|
|
|
+ print("页面已检测不到新内容,停止循环。")
|
|
|
+ print("No new content detected on the page, stop loop.")
|
|
|
+ finished = True
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ bodyText = newBodyText
|
|
|
element = self.browser.find_element(
|
|
|
By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
|
|
|
for i in node["sequence"]: # 挨个执行操作
|
|
@@ -1190,29 +1221,42 @@ class BrowserThread(Thread):
|
|
|
# p["relativeXPath"] = p["relativeXPath"].lower()
|
|
|
# p["relativeXPath"] = lowercase_tags_in_xpath(p["relativeXPath"])
|
|
|
# 已经有text()或@href了,不需要再加
|
|
|
+ content_type = ""
|
|
|
if p["relativeXPath"].find("/@href") >= 0 or p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
|
|
|
- xpath = p["relativeXPath"]
|
|
|
+ content_type = ""
|
|
|
elif p["nodeType"] == 2:
|
|
|
- xpath = p["relativeXPath"] + "/@href"
|
|
|
+ content_type = "/@href"
|
|
|
elif p["contentType"] == 1:
|
|
|
- xpath = p["relativeXPath"] + "/text()"
|
|
|
+ content_type = "/text()"
|
|
|
elif p["contentType"] == 0:
|
|
|
- xpath = p["relativeXPath"] + "//text()"
|
|
|
+ content_type = "//text()"
|
|
|
+ xpath = p["relativeXPath"] + content_type
|
|
|
if p["relative"]:
|
|
|
# if p["relativeXPath"] == "":
|
|
|
# content = [loopElementHTML]
|
|
|
# else:
|
|
|
# 如果字串里有//即子孙查找,则不动语句
|
|
|
if p["relativeXPath"].find("//") >= 0:
|
|
|
- full_path = "(" + parentPath + \
|
|
|
- xpath + ")" + \
|
|
|
- "[" + str(index + 1) + "]"
|
|
|
- content = pageHTML.xpath(full_path)
|
|
|
+ if xpath.startswith("/"):
|
|
|
+ full_path = "(" + parentPath + ")" + \
|
|
|
+ "[" + str(index + 1) + "]"+ \
|
|
|
+ p["relativeXPath"] + content_type
|
|
|
+ else: # 如果是id()这种形式,不需要包parentPath
|
|
|
+ full_path = xpath
|
|
|
+ try:
|
|
|
+ content = pageHTML.xpath(full_path)
|
|
|
+ except:
|
|
|
+ content = []
|
|
|
+ elif not p["relativeXPath"].startswith("/"): # 如果是id()这种形式,不需要包/html/body
|
|
|
+ try:
|
|
|
+ content = loopElementHTML.xpath(xpath)
|
|
|
+ except:
|
|
|
+ content = []
|
|
|
else:
|
|
|
content = loopElementHTML.xpath(
|
|
|
"/html/body/" + loopElementHTML[0][0].tag + xpath)
|
|
|
else:
|
|
|
- if xpath.find("/body") < 0:
|
|
|
+ if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body
|
|
|
xpath = "/html/body" + xpath
|
|
|
content = pageHTML.xpath(xpath)
|
|
|
if len(content) > 0:
|
|
@@ -1258,9 +1302,12 @@ class BrowserThread(Thread):
|
|
|
else:
|
|
|
# 如果字串里有//即子孙查找,则不动语句
|
|
|
if p["relativeXPath"].find("//") >= 0:
|
|
|
- full_path = "(" + parentPath + \
|
|
|
- p["relativeXPath"] + ")" + \
|
|
|
- "[" + str(index + 1) + "]"
|
|
|
+ # full_path = "(" + parentPath + \
|
|
|
+ # p["relativeXPath"] + ")" + \
|
|
|
+ # "[" + str(index + 1) + "]"
|
|
|
+ full_path = "(" + parentPath + ")" + \
|
|
|
+ "[" + str(index + 1) + "]" + \
|
|
|
+ p["relativeXPath"]
|
|
|
element = self.browser.find_element(
|
|
|
By.XPATH, full_path, iframe=p["iframe"])
|
|
|
else:
|
|
@@ -1390,6 +1437,8 @@ if __name__ == '__main__':
|
|
|
if sys.platform == "win32" and platform.architecture()[0] == "32bit":
|
|
|
options.binary_location = os.path.join(
|
|
|
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
|
|
|
+ option.binary_location = os.path.join(
|
|
|
+ os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
|
|
|
driver_path = os.path.join(
|
|
|
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
|
|
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
|
@@ -1397,12 +1446,15 @@ if __name__ == '__main__':
|
|
|
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
|
|
|
options.binary_location = os.path.join(
|
|
|
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
|
|
|
+ option.binary_location = os.path.join(
|
|
|
+ os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
|
|
|
driver_path = os.path.join(
|
|
|
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
|
|
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
|
|
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
|
|
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
|
|
|
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
|
|
|
+ option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
|
|
|
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
|
|
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
|
|
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
|
@@ -1422,6 +1474,7 @@ if __name__ == '__main__':
|
|
|
print("Finding chromedriver in EasySpider",
|
|
|
os.getcwd()+"/ElectronJS")
|
|
|
option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
|
|
|
+ options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
|
|
|
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
|
|
|
option.add_extension("../ElectronJS/XPathHelper.crx")
|
|
|
else:
|
|
@@ -1431,10 +1484,7 @@ if __name__ == '__main__':
|
|
|
|
|
|
option.add_experimental_option(
|
|
|
'excludeSwitches', ['enable-automation']) # 以开发者模式
|
|
|
- options.add_argument('-ignore-certificate-errors')
|
|
|
- options.add_argument('-ignore -ssl-errors')
|
|
|
- option.add_argument('-ignore-certificate-errors')
|
|
|
- option.add_argument('-ignore -ssl-errors')
|
|
|
+
|
|
|
# user_data_dir = r'' # 注意没有Default!
|
|
|
|
|
|
# options.add_argument('--user-data-dir='+p)
|
|
@@ -1496,6 +1546,8 @@ if __name__ == '__main__':
|
|
|
except:
|
|
|
cloudflare = 0
|
|
|
if cloudflare == 0:
|
|
|
+ options.add_argument('log-level=3') # 隐藏日志
|
|
|
+ option.add_argument('log-level=3') # 隐藏日志
|
|
|
options.add_experimental_option("prefs", {
|
|
|
# 设置文件下载路径
|
|
|
"download.default_directory": "Data/Task_" + str(i),
|
|
@@ -1526,10 +1578,9 @@ if __name__ == '__main__':
|
|
|
options=options, chrome_options=option, executable_path=driver_path)
|
|
|
elif cloudflare == 1:
|
|
|
if sys.platform != "darwin":
|
|
|
+ options.binary_location = "" # 需要用自己的浏览器
|
|
|
browser_t = MyUCChrome(
|
|
|
- options=options, chrome_options=option, driver_executable_path=driver_path)
|
|
|
- print("Pass Cloudflare Mode")
|
|
|
- print("过Cloudflare验证模式")
|
|
|
+ options=options, driver_executable_path=driver_path)
|
|
|
else:
|
|
|
print("Not support Cloudflare Mode on MacOS")
|
|
|
print("MacOS不支持Cloudflare验证模式")
|
|
@@ -1556,6 +1607,9 @@ if __name__ == '__main__':
|
|
|
print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
|
|
|
print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
|
|
|
print("----------------------------------\n\n")
|
|
|
+ if cloudflare:
|
|
|
+ print("过Cloudflare验证模式有时候会不稳定,请注意观察上方提示的浏览器版本信息是否正确,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
|
|
|
+ print("Passing the Cloudflare verification mode is sometimes unstable. Please pay attention to whether the browser version information prompted above is correct. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
|
|
|
# 使用监听器监听键盘输入
|
|
|
try:
|
|
|
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:
|