前言
最近使用 selenium、selenium-wire 爬取数据,配置代理时查阅了很多资料,实际使用后发现不少资料和博客给出的用法是错误的,容易误导刚开始学习 selenium 代理配置的开发者。
描述:
我这里使用的环境是:Python 3.12.2、selenium==4.23.1、selenium-wire==5.1.0
1.selenium使用代理
1.1核心代码
注意:这里是 Python 原生 selenium 使用代理的方法。亲测 selenium-wire 不能这么用——这样写代理不会生效。有些博客说 selenium-wire 使用下面的写法不报错,这完全是误导:selenium-wire 用下面的写法时根本没有使用代理 ip(没有连接代理 ip),自然也就不会报错。
建议大家使用稳定的、支持https的代理ip,支持https的代理才能访问https的网站,不要使用免费代理,懂的都懂
写法一
# Proxy address as "host:port" -- replace with your own proxy IP and port.
ip_port = '117.86.185.68:8089'
# Hand the proxy to Chrome through its --proxy-server command-line switch.
# Literal equivalent: options.add_argument("--proxy-server=117.86.185.68:8089")
options.add_argument("--proxy-server=" + ip_port)
写法二
# Proxy address as "host:port" -- replace with your own proxy IP and port.
ip_port = '117.86.185.68:8089'
# Same switch as variant one, but with an explicit http:// scheme prefix.
# Literal equivalent: options.add_argument("--proxy-server=http://117.86.185.68:8089")
options.add_argument("--proxy-server=http://" + ip_port)
1.2检验是否使用代理
# Verify the proxy: httpbin.org/ip echoes back the IP address the request
# came from, so the printed page should show the proxy's IP.
try:
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    # Demo-level handling: just show why the page could not be loaded.
    print(e)
finally:
    # Always shut the browser down, whether or not the request succeeded.
    driver.quit()
1.3完整代码
Chrome 浏览器(Chromium):chrome-win 文件夹里包含 chrome 和 chromedriver(个人整理),浏览器版本与 chromedriver 版本一致,均为 114.0.5735.90,如有需要可自行提取。
链接:https://pan.baidu.com/s/1vv6AfmCBFx8QDA7RE2VrIg
提取码:6666
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Base directory used to locate the bundled browser.
# NOTE: os.getcwd() is the current working directory, not necessarily the
# folder of this file -- run the script from the folder containing chrome-win.
current_path = os.getcwd()
# Folder that holds the Chrome browser
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Create a Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Create the Chrome options
options = Options()
# options.add_argument("--headless")  # headless mode (no UI)
# Disable images
# options.add_argument('blink-settings=imagesEnabled=false')
# Tell Selenium which Chrome binary to launch (full path to chrome.exe)
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Proxy address as "host:port" -- replace with your own proxy IP and port
ip_port = '117.86.185.68:8089'
# Literal equivalent: options.add_argument("--proxy-server=117.86.185.68:8089")
options.add_argument(f"--proxy-server={ip_port}")
# Create the Chrome driver
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(15)  # page-load timeout of 15 seconds
# driver.set_script_timeout(15)  # script timeout of 15 seconds
# Verify the proxy: httpbin.org/ip echoes back the IP address the request came from.
try:
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    # Demo-level handling: just show why the page could not be loaded.
    print(e)
finally:
    # Always shut the browser down, whether or not the request succeeded.
    driver.quit()
2.selenium-wire使用代理
2.1核心代码
方式一(浏览器启动前配置)
# selenium-wire proxy settings, supplied before the browser starts.
seleniumwire_options = {
    'proxy': {
        'http': 'http://180.127.3.147:8090',    # replace with your own proxy IP and port
        'https': 'https://180.127.3.147:8090',  # replace with your own proxy IP and port
        # Hosts that should bypass the proxy (see the selenium-wire proxy docs).
        'no_proxy': 'localhost,127.0.0.1',
    }
}
# Create the Chrome driver, handing the proxy settings to selenium-wire
driver = webdriver.Chrome(service=service, options=options, seleniumwire_options=seleniumwire_options)
方式二(动态切换)
# Start Chrome through selenium-wire with no proxy configured up front.
driver = webdriver.Chrome(options=options, service=service)
# selenium-wire proxy, approach two (dynamic switching): assign the proxy
# mapping to driver.proxy on the already-running driver.
driver.proxy = dict(
    http='http://180.127.3.147:8090',
    https='https://180.127.3.147:8090',
)
2.2检验是否使用代理
# Verify the proxy: httpbin.org/ip echoes back the IP address the request
# came from, so the printed page should show the proxy's IP.
try:
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    # Demo-level handling: just show why the page could not be loaded.
    print(e)
finally:
    # Always shut the browser down, whether or not the request succeeded.
    driver.quit()
2.3完整代码
import os

from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Base directory used to locate the bundled browser.
# NOTE: os.getcwd() is the current working directory, not necessarily the
# folder of this file -- run the script from the folder containing chrome-win.
current_path = os.getcwd()
# Folder that holds the Chrome browser
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Create a Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Create the Chrome options
options = Options()
# options.add_argument("--headless")  # headless mode (no UI)
# Disable images
# options.add_argument('blink-settings=imagesEnabled=false')
# Tell Selenium which Chrome binary to launch (full path to chrome.exe)
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# # selenium-wire proxy, approach one (configure before the browser starts)
# seleniumwire_options = {
#     'proxy': {
#         'http': 'http://180.127.3.147:8090',  # replace with your own proxy IP and port
#         'https': 'https://180.127.3.147:8090',  # replace with your own proxy IP and port
#         'no_proxy': 'localhost,127.0.0.1'
#     }
# }
# Create the Chrome driver
driver = webdriver.Chrome(service=service, options=options)
# selenium-wire proxy, approach two (dynamic switching on the live driver)
driver.proxy = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090',
}
# driver.set_page_load_timeout(15)  # page-load timeout of 15 seconds
# driver.set_script_timeout(15)  # script timeout of 15 seconds
# Verify the proxy: httpbin.org/ip echoes back the IP address the request came from.
try:
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    # Demo-level handling: just show why the page could not be loaded.
    print(e)
finally:
    # Always shut the browser down, whether or not the request succeeded.
    driver.quit()
3.使用代理插件Selenium-Chrome-HTTP-Private-Proxy
3.1创建插件的方法(此方法可以封装在工具类里使用)
import string
import zipfile# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a Chrome extension zip that sets a fixed proxy and answers proxy auth.

    The archive holds two files: ``manifest.json`` (manifest version 2) and
    ``background.js``. The script calls ``chrome.proxy.settings.set`` with a
    ``fixed_servers`` configuration and registers a blocking
    ``chrome.webRequest.onAuthRequired`` listener that returns the given
    credentials. The listener is registered even when the credentials are
    empty strings.

    Args:
        proxy_host (str): domain or IP address, e.g. proxy.domain.com
        proxy_port (int | str): proxy port
        proxy_username (str): auth username
        proxy_password (str): auth password
        scheme (str): proxy scheme, default ``http``
        plugin_path (str): where to write the zip; defaults to
            ``Selenium-Chrome-HTTP-Private-Proxy.zip`` (a relative path, so it
            lands in the current working directory)

    Returns:
        str: ``plugin_path``, the path of the zip that was written.
    """
    # Local import keeps this function self-contained when copied into a tool module.
    import json

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version": "22.0.0"
}
"""

    # Every placeholder receives a complete JSON string literal (quotes
    # included). JSON string syntax is valid JavaScript, so quotes or
    # backslashes in a value (e.g. in a password) cannot break the script.
    background_js = string.Template("""
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: ${scheme},
            host: ${host},
            port: parseInt(${port})
        },
        bypassList: ["foobar.com"]
    }
};

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: ${username},
            password: ${password}
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
""").substitute(
        host=json.dumps(str(proxy_host)),
        port=json.dumps(str(proxy_port)),
        username=json.dumps(str(proxy_username)),
        password=json.dumps(str(proxy_password)),
        scheme=json.dumps(str(scheme)),
    )

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
3.2核心代码
# Proxy settings in the order: host, port, username, password.
proxy_config = ["125.112.183.182", "8090", "", ""]
# The list order matches the first four parameters of
# create_proxyauth_extension, so it can be unpacked positionally.
proxyauth_plugin_path = create_proxyauth_extension(*proxy_config)
# Load the generated proxy extension into the browser
options.add_extension(proxyauth_plugin_path)
driver = webdriver.Chrome(options=options, service=service)
3.3 完整代码
我这里为方便演示,把创建浏览器插件的方法(create_proxyauth_extension)和其余代码写在了一起;建议将 create_proxyauth_extension 方法封装到工具类中调用,以提高代码的可读性和整洁性。
selenium 和 selenium-wire 使用浏览器代理插件的用法是一样的。
import os
import string
import zipfile# from seleniumwire import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a Chrome extension zip that sets a fixed proxy and answers proxy auth.

    The archive holds two files: ``manifest.json`` (manifest version 2) and
    ``background.js``. The script calls ``chrome.proxy.settings.set`` with a
    ``fixed_servers`` configuration and registers a blocking
    ``chrome.webRequest.onAuthRequired`` listener that returns the given
    credentials. The listener is registered even when the credentials are
    empty strings.

    Args:
        proxy_host (str): domain or IP address, e.g. proxy.domain.com
        proxy_port (int | str): proxy port
        proxy_username (str): auth username
        proxy_password (str): auth password
        scheme (str): proxy scheme, default ``http``
        plugin_path (str): where to write the zip; defaults to
            ``Selenium-Chrome-HTTP-Private-Proxy.zip`` (a relative path, so it
            lands in the current working directory)

    Returns:
        str: ``plugin_path``, the path of the zip that was written.
    """
    # Local import keeps this function self-contained when copied into a tool module.
    import json

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version": "22.0.0"
}
"""

    # Every placeholder receives a complete JSON string literal (quotes
    # included). JSON string syntax is valid JavaScript, so quotes or
    # backslashes in a value (e.g. in a password) cannot break the script.
    background_js = string.Template("""
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: ${scheme},
            host: ${host},
            port: parseInt(${port})
        },
        bypassList: ["foobar.com"]
    }
};

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: ${username},
            password: ${password}
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
""").substitute(
        host=json.dumps(str(proxy_host)),
        port=json.dumps(str(proxy_port)),
        username=json.dumps(str(proxy_username)),
        password=json.dumps(str(proxy_password)),
        scheme=json.dumps(str(scheme)),
    )

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path


# Base directory used to locate the bundled browser.
# NOTE: os.getcwd() is the current working directory, not necessarily the
# folder of this file -- run the script from the folder containing chrome-win.
current_path = os.getcwd()
# Folder that holds the Chrome browser
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Create a Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Create the Chrome options
options = Options()
# Tell Selenium which Chrome binary to launch
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Proxy IP, port, username, password. Fill in the credentials if the proxy
# requires them; otherwise leave them as empty strings.
proxy_config = ["125.112.183.182", "8090", "", ""]
proxyauth_plugin_path = create_proxyauth_extension(
    proxy_host=proxy_config[0],
    proxy_port=proxy_config[1],
    proxy_username=proxy_config[2],
    proxy_password=proxy_config[3],
)
# Load the generated proxy extension into the browser
options.add_extension(proxyauth_plugin_path)
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(10)
# driver.set_script_timeout(8)
# Verify the proxy: httpbin.org/ip echoes back the IP address the request came from.
try:
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    # Demo-level handling: just show why the page could not be loaded.
    print(e)
finally:
    # Always shut the browser down, whether or not the request succeeded.
    driver.quit()
源代码地址:https://gitee.com/jxzcode_admin/flask-project.git
参考资料
https://pypi.org/project/selenium-wire/#socks
https://blog.csdn.net/zwq912318834/article/details/78626739
https://www.cnblogs.com/roystime/p/6935543.html