Spider Middleware
# Spider middleware (for awareness only): defined in middlewares.py
from scrapy import signals


class MysfirstscrapySpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
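To make Scrapy actually run this class, it has to be registered in settings.py. A minimal sketch, assuming the project is named mysfirstscrapy (as the class name suggests); the number sets the middleware's ordering priority:

# settings.py
SPIDER_MIDDLEWARES = {
    'mysfirstscrapy.middlewares.MysfirstscrapySpiderMiddleware': 543,
}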
Downloader Middleware
- on the way in: the Request object
- add a proxy
- add cookies
- add request headers
- on the way out: the Response object
- modify the response object; what finally reaches the spider's parse callback is the modified response (see the sketch after this list)
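As an illustration of the last point, here is a minimal sketch of rewriting a response body in process_response; Response.replace() is Scrapy's way of building a modified copy, and the byte strings here are purely illustrative:

def process_response(self, request, response, spider):
    # Return a copy of the response with a modified body; the spider's
    # parse callback receives this modified object, not the original.
    new_body = response.body.replace(b'old text', b'new text')
    return response.replace(body=new_body)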
# Downloader middleware
class MysfirstscrapyDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Runs when a request comes through
    def process_request(self, request, spider):
        # Possible return values:
        # return None: keep processing this request; the next middleware's process_request runs
        # return a Response: runs this middleware's process_response on the way back, goes to
        #   the engine, gets scheduled, enters step 6 of the flow, and lands in the spider's
        #   parse callback
        # return a Request: handed straight back to the engine to be scheduled; it enters step 2
        #   of the flow and waits in the scheduler to be crawled on a later pass
        # raise IgnoreRequest: process_exception is executed
        return None

    # Runs when a response goes back out
    def process_response(self, request, response, spider):
        # Possible return values:
        # return a Response: keep going; it reaches the engine and is dispatched to the spider
        #   for parsing
        # return a Request: it goes back to the engine and into the scheduler
        # raise IgnoreRequest: process_exception is executed
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops the process_exception() chain
        # - return a Request object: stops the process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

# Enable it in the settings file
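A minimal sketch of that settings entry, assuming the default layout of a project named mysfirstscrapy; the number controls ordering relative to Scrapy's built-in downloader middlewares:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'mysfirstscrapy.middlewares.MysfirstscrapyDownloaderMiddleware': 543,
}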
1. Adding a proxy
# Write the code in the downloader middleware's def process_request(self, request, spider)
# Step 1: write the process_request method in the downloader middleware
def get_proxy(self):
    import requests
    res = requests.get('http://127.0.0.1:5010/get/').json()
    if res.get('https'):
        return 'https://' + res.get('proxy')
    else:
        return 'http://' + res.get('proxy')

def process_request(self, request, spider):
    request.meta['proxy'] = self.get_proxy()
    return None

# Step 2: the proxy may not work, which triggers process_exception; write the handling there
def process_exception(self, request, exception, spider):
    print('-----', request.url)  # this URL was not crawled
    return request
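One caveat worth adding (based on Scrapy's default dupefilter behavior, not part of the original notes): a request returned from process_exception goes back to the scheduler, and since its URL has already been seen it can be silently dropped as a duplicate. A sketch of a retry that swaps in a fresh proxy and bypasses the filter:

def process_exception(self, request, exception, spider):
    # Assume the proxy failed; fetch a fresh one and retry the request.
    request.meta['proxy'] = self.get_proxy()
    # Without dont_filter, the re-scheduled request may be discarded
    # by the dupefilter as a duplicate of the original.
    request.dont_filter = True
    return request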
2. Adding cookies, modifying request headers, and generating a random User-Agent
2.1 Adding cookies
def process_request(self, request, spider):
    print(request.cookies)
    request.cookies['name'] = 'lqz'
    return None
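The same hook can load a whole set of cookies at once, for example ones captured from a logged-in session; a sketch in which the key/value pairs are made up for illustration:

def process_request(self, request, spider):
    # Merge a stored session's cookies into the outgoing request.
    request.cookies.update({
        'sessionid': 'abc123',   # illustrative value
        'csrftoken': 'xyz789',   # illustrative value
    })
    return None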
2.2 Modifying request headers
def process_request(self, request, spider):
    print(request.headers)
    request.headers['referer'] = 'http://www.lagou.com'
    return None
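If a header should apply to every request anyway, Scrapy's DEFAULT_REQUEST_HEADERS setting is a simpler place for it than a middleware; a sketch with example values:

# settings.py
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'http://www.lagou.com',
}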
2.3 Generating a random User-Agent
The module needs to be installed first:
pip install fake_useragent
def process_request(self, request, spider):
    # the fake_useragent module
    from fake_useragent import UserAgent
    ua = UserAgent()
    request.headers['User-Agent'] = str(ua.random)
    print(request.headers)
    return None
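Instantiating UserAgent() inside process_request rebuilds the user-agent pool on every request; a slightly leaner sketch (the class name RandomUserAgentMiddleware is invented here, not from the original notes) builds it once and reuses it. It still has to be registered in DOWNLOADER_MIDDLEWARES like any other downloader middleware:

from fake_useragent import UserAgent

class RandomUserAgentMiddleware:
    def __init__(self):
        # Build the user-agent pool once, not on every request.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = str(self.ua.random)
        return None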