用selenium包。
# -*- coding: utf-8 -*-
# 说明: 当前的浏览器驱动用的是edge浏览器.
# 需要下载浏览器驱动,并把它的路径添加到环境变量
from selenium import webdriver
import time
# Minimal session: start Edge, load the homepage, then shut the browser down.
homepage_url = 'http://10.180.10.93:8000/'
driver = webdriver.Edge()
try:
    driver.get(homepage_url)
finally:
    # quit() sits in a finally block so that a failed page load does not
    # leave the browser and its driver process running.
    driver.quit()
打开xx页面
# Configure the session first so the settings apply to the lookup below.
driver.maximize_window()
# Implicit wait: element lookups keep polling for up to 6 seconds before
# giving up when the element is not present yet.
driver.implicitly_wait(6)
# Collect every <a> element on the page. The locator is passed as a plain
# string ('tag name' is the value of By.TAG_NAME), which works both on
# Selenium releases that still have find_elements_by_tag_name and on those
# that removed it.
nodes = driver.find_elements('tag name', 'a')
和dom找元素很像
node = nodes[0],然后 node.click() 模拟鼠标点击。
前进、后退
driver.forward()  # navigate forward in the browser history
driver.back()  # navigate back in the browser history
# A link with target="_blank" opens in a new tab: click it, check the title,
# then close the new tab and hand focus back to the original one.
if node.get_attribute('target') == '_blank':
    node.click()
    time.sleep(SLEEPING_SECOND)
    # Focus has not been switched yet, so this checks the original tab.
    assert driver.title == EXPECTED_TITLE
    windows = driver.window_handles
    driver.switch_to.window(windows[1])  # the newly opened tab
    driver.close()
    driver.switch_to.window(windows[0])  # back to the original tab
测试:把网页上所有的链接都点一遍。
# -*- coding: utf-8 -*-
# 说明: 当前的浏览器驱动用的是edge浏览器.
# 需要下载浏览器驱动,并把它的路径添加到环境变量
from __future__ import print_function
from selenium import webdriver
import time
# --- Settings ---------------------------------------------------------------
homepage_url = 'http://10.180.10.93:8000/'
SLEEPING_SECOND = 1  # pause (seconds) after each navigation or click
EXPECTED_TITLE = 'Homepage of Quant Team One'

# Pages that must not be visited by the crawl.
url_filtered = {
    'https://www.djangoproject.com/',
    'https://purecss.io/',
    'https://www.pgadmin.org/',
    'http://quantlib.org/index.shtml',
    'https://www.pgadmin.org/docs/',
    'http://quantlib.org/quantlibaddin/extend_tutorial.html#extend_autogenerate',
    'http://quantlib.org/quantlibxl/faq.html#faq_item_debugexcel',
    'https://pypi.python.org/pypi',
    'https://www.anaconda.com/download/#windows',
    'http://quantlib.org/install/vc10.shtml',
    'http://quantlib.org/quantlibxl/',
    'http://sourceforge.net/projects/boost/files/boost-binaries/',
    'https://www.postgresql.org/',
    'http://www.cnblogs.com/newpanderking/articles/3372969.html',
    'http://www.pgadmin.org/',
    'http://quantlib.org/quantlibaddin/build_qlxl.html',
    'https://sourceforge.net/p/quantlib/mailman/quantlib-dev/?style=flat&viewmonth=201101',
    'http://quantlib.10058.n7.nabble.com/problems-compiling-QuantLibXL-from-td15602.html',
}

# URLs that have already been handled; starts empty.
url_visited = set()

# --- Browser start-up -------------------------------------------------------
driver = webdriver.Edge()
driver.get(homepage_url)
time.sleep(SLEEPING_SECOND)
def traverse():
    """Click every link on the current page that has not been handled yet.

    Works on the module-level ``driver`` and records progress in the
    module-level ``url_visited`` set. Links whose href is already in
    ``url_visited`` or in ``url_filtered`` are skipped. A link with
    ``target="_blank"`` is clicked, then the tab it opened is closed. Any
    other link is clicked, the resulting page is traversed recursively, and
    the browser then navigates back.

    Failures while clicking or following a single link are printed and the
    loop moves on to the next link (best effort).

    Raises:
        ValueError: if the current URL contains 'ErrorStatus=0x800C0005'.
        AssertionError: if the page title differs from ``EXPECTED_TITLE``
            at the start of an iteration.
    """
    print(url_visited)
    # The locator is passed as a plain string ('tag name' is the value of
    # By.TAG_NAME) so this works on Selenium releases with and without the
    # find_elements_by_* helpers.
    link_count = len(driver.find_elements('tag name', 'a'))
    if not link_count:
        return

    for i in range(link_count):
        print('当前:%s' % driver.current_url)
        if 'ErrorStatus=0x800C0005' in driver.current_url:
            raise ValueError('站点无法访问')
        assert driver.title == EXPECTED_TITLE

        # Re-query on every pass: elements located before a navigation go
        # stale and raise when used again.
        nodes = driver.find_elements('tag name', 'a')
        if i >= len(nodes):
            # The page now exposes fewer links than when the loop started;
            # indexing would raise IndexError outside the try below.
            break
        node = nodes[i]
        url = node.get_attribute("href")

        # Skip the anchor whose visible text is only this whitespace run.
        if node.text in ['\n \n \n ']:
            continue
        # Skip links already handled and links that must not be visited.
        if url in url_visited or url in url_filtered:
            continue

        print('%s, %s' % (node.text, url))
        try:
            if node.get_attribute('target') == '_blank':
                # Opens a new tab: verify, record, then close that tab and
                # return focus to the original one.
                node.click()
                time.sleep(SLEEPING_SECOND)
                assert driver.title == EXPECTED_TITLE
                url_visited.add(url)
                windows = driver.window_handles
                driver.switch_to.window(windows[1])
                driver.close()
                driver.switch_to.window(windows[0])
            else:
                # Same-tab navigation: verify, record, recurse into the new
                # page, then come back.
                node.click()
                time.sleep(SLEEPING_SECOND)
                assert driver.title == EXPECTED_TITLE
                url_visited.add(url)
                traverse()
                driver.back()
        except Exception as e:
            # Best effort: report the failure and continue with the next link.
            print(e)

    # NOTE(review): the original indentation was lost; this statement is
    # assumed to run once after the loop rather than inside the except
    # block -- confirm against the author's intent.
    url_visited.add(driver.current_url)
# Run the crawl. quit() sits in a finally block so the browser is closed even
# when traverse() raises (for example its ValueError or a failed title check).
try:
    traverse()
finally:
    driver.quit()