# Download a web page.
# Features: catch exceptions, retry failed downloads, and set a user agent.
import urllib.request
import urllib.error


# wscp: default user agent, short for "web scraping with python"
def download(url, user_agent='wscp', num_retries=2):
    print('Downloading:', url)  # print the URL being downloaded
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:  # retry when the download hits an error
            # Only retry on 5xx server errors; a 4xx error such as
            # 404 Not Found means the page does not exist, so there
            # is no point downloading it again.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                print(user_agent)
                return download(url, user_agent, num_retries - 1)
    return html


download('http://example.webscraping.com/')
download('http://httpstat.us/500')  # trigger a 500 error to test retries
# print(dir(urllib))
Downloading: http://example.webscraping.com/
Downloading: http://httpstat.us/500
download error: Internal Server Error
wscp
Downloading: http://httpstat.us/500
download error: Internal Server Error
wscp
Downloading: http://httpstat.us/500
download error: Internal Server Error
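For contrast, the no-retry path can be exercised with a 4xx status (assuming httpstat.us serves a /404 endpoint the same way it serves /500): the hasattr check finds e.code outside the 500-599 range, so download() reports the error once and returns None without recursing.

download('http://httpstat.us/404')  # 404 Not Found: reported once, no retries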