python组件介绍_python 中的爬虫· scrapy框架 重要的组件的介绍

一 。  去重的规则组件

去重默认通过 set() 实现(指纹保存在内存中);若使用自定义的去重组件,可以把指纹数据存到 redis 中,

找到这个类  : from scrapy.dupefilter import RFPDupeFilter

a. 爬虫中yield Request(...dont_filter=False)

b. 类

from scrapy.dupefilter import BaseDupeFilter

import redis

from scrapy.utils.request import request_fingerprint

class XzxDupefilter(BaseDupeFilter):
    """Redis-backed duplicate filter: keeps request fingerprints in a Redis set.

    Enable via settings:
        DUPEFILTER_CLASS = 'xzx.dupfilter.XzxDupefilter'
        DUP_REDIS_KEY    - name of the Redis set holding the fingerprints
        DUP_REDIS_HOST / DUP_REDIS_PORT - Redis location (optional; default
                                          127.0.0.1:6379)
    """

    def __init__(self, key, host='127.0.0.1', port=6379):
        # The connection is created lazily in open() so this object can be
        # constructed before the crawl actually starts.
        self.conn = None
        self.key = key
        self.host = host
        self.port = port

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from the crawler settings."""
        key = settings.get('DUP_REDIS_KEY')
        # Fix: the Redis address used to be hard-coded inside open(); read it
        # from settings instead, with defaults that keep the old behavior.
        host = settings.get('DUP_REDIS_HOST') or '127.0.0.1'
        port = settings.get('DUP_REDIS_PORT') or 6379
        return cls(key, host, port)

    def open(self):
        """Called when the spider opens: connect to Redis."""
        self.conn = redis.Redis(host=self.host, port=self.port)

    def request_seen(self, request):
        """Return True if this request's fingerprint was already recorded.

        SADD returns 0 when the member already existed in the set, so
        ``added == 0`` means "seen before" and the request is dropped.
        """
        fp = request_fingerprint(request)
        added = self.conn.sadd(self.key, fp)
        return added == 0

c. settings中配置

# 默认dupefilter

# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

DUPEFILTER_CLASS = 'xzx.dupfilter.XzxDupefilter' # 可以自定义的

这个类给url 添加一个唯一的标识:

from scrapy.utils.request import request_fingerprint

补充:调度器中有一段代码来规定

def enqueue_request(self, request):
    """Scheduler entry point (excerpt): drop duplicates, then enqueue.

    NOTE(review): this is a truncated excerpt of Scrapy's
    Scheduler.enqueue_request — the original continues after _dqpush.
    """
    # dont_filter=True  -> `not dont_filter` is False -> dedup check skipped
    # dont_filter=False -> `not dont_filter` is True  -> request_seen() decides
    if not request.dont_filter and self.df.request_seen(request):
        return False
    # Hand the request to the scheduler's (disk) queue.
    dqok = self._dqpush(request)

二 。调度器

1. 广度优先 (本质就是队列,先进先出)

2. 深度优先 (本质就是栈,后进先出)

3. 优先级队列 (redis的有序集合)

三  下载中间件

下载中间件是 引擎 与 下载器 之间的中间件。

a.     scrapy中下载中间件的作用?

统一对所有请求批量对request对象进行下载前的预处理。

b. 针对user-agent,内置的默认中间件会执行, 获取的是settings 中自己配置的user-agent

class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        # Seeded from the USER_AGENT setting (e.g. a full browser UA string);
        # a spider-level attribute may replace it once the spider opens.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(crawler.settings['USER_AGENT'])
        # Re-read the UA from the spider when it opens.
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        # A spider may define its own `user_agent`; otherwise keep the
        # configured one.
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        # Only set the header when a UA is configured, and never overwrite
        # one already present on the request.
        if not self.user_agent:
            return
        request.headers.setdefault(b'User-Agent', self.user_agent)

c. 关于重定向 内置对的默认的

class BaseRedirectMiddleware(object):
    """Shared plumbing for redirect middlewares: TTL / counter bookkeeping.

    Settings:
        REDIRECT_ENABLED         - master switch; raising NotConfigured
                                   removes this middleware from the chain
        REDIRECT_MAX_TIMES       - max redirects allowed per request chain
        REDIRECT_PRIORITY_ADJUST - priority delta applied to redirected requests
    """

    enabled_setting = 'REDIRECT_ENABLED'

    def __init__(self, settings):
        if not settings.getbool(self.enabled_setting):
            raise NotConfigured
        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def _redirect(self, redirected, request, spider, reason):
        """Either return `redirected` for scheduling, or give up.

        Tracks two counters in request meta: 'redirect_ttl' (counts down from
        max_redirect_times) and 'redirect_times' (counts up); the chain of
        source URLs is accumulated in 'redirect_urls'.
        Raises IgnoreRequest once the limit is exhausted.
        """
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1
        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            # The new request inherits the dedup flag and gets its priority
            # adjusted so redirects are handled promptly.
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):
        """Clone `request` as a body-less GET aimed at `redirect_url`."""
        redirected = request.replace(url=redirect_url, method='GET', body='')
        # The GET carries no body, so the entity headers must go too.
        redirected.headers.pop('Content-Type', None)
        redirected.headers.pop('Content-Length', None)
        return redirected

class RedirectMiddleware(BaseRedirectMiddleware):
    """
    Handle redirection of requests based on response status
    and meta-refresh html tag.
    """

    def process_response(self, request, response, spider):
        # Respect opt-outs: explicit dont_redirect, or statuses the spider /
        # request asked to handle itself.
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        # Only act on real redirect statuses that carry a Location header.
        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])
        # Location may be relative; resolve it against the request URL.
        redirected_url = urljoin(request.url, location)

        # 301/307/308 (and any HEAD request) preserve the original method and
        # body; the remaining statuses (302/303) are re-issued as GET.
        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)

d. 关于cookie 是内置的默认的就执行

用法: 在自己写的逻辑里 yield 时加上 meta={"cookiejar":1}。注意键名是全小写的 "cookiejar",要与 CookiesMiddleware 中 request.meta.get("cookiejar") 读取的键一致:

def start_requests(self):
    """Kick off the crawl: yield one request per start URL.

    Each request carries meta['cookiejar'] so CookiesMiddleware maintains a
    dedicated CookieJar for this session key.
    """
    for url in self.start_urls:
        # Fix: the meta key must be the lowercase "cookiejar" —
        # CookiesMiddleware reads request.meta.get("cookiejar"), so the
        # original "cookieJar" key was silently ignored (all requests fell
        # into the default jar under key None).
        yield Request(url=url, callback=self.parse, meta={"cookiejar": 1})

class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies"""

    def __init__(self, debug=False):
        # One CookieJar per meta['cookiejar'] key; requests that do not set
        # the key all share the jar stored under None.
        self.jars = defaultdict(CookieJar)
        self.debug = debug

    @classmethod
    def from_crawler(cls, crawler):
        # COOKIES_ENABLED is the master switch; COOKIES_DEBUG turns on
        # logging of every Cookie / Set-Cookie header.
        if not crawler.settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings.getbool('COOKIES_DEBUG'))

    def process_request(self, request, spider):
        """Attach the stored cookies for this jar to the outgoing request."""
        if request.meta.get('dont_merge_cookies', False):
            return
        # e.g. cookiejarkey = 1 when the spider sent meta={"cookiejar": 1}
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]  # CookieJar object (empty on first use)
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)
        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        """Store cookies the server set on this response into the jar."""
        if request.meta.get('dont_merge_cookies', False):
            return response
        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)
        return response

    def _debug_cookie(self, request, spider):
        # With COOKIES_DEBUG on, log every Cookie header we are sending.
        if self.debug:
            cl = [to_native_str(c, errors='replace')
                  for c in request.headers.getlist('Cookie')]
            if cl:
                cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
                msg = "Sending cookies to: {}\n{}".format(request, cookies)
                logger.debug(msg, extra={'spider': spider})

    def _debug_set_cookie(self, response, spider):
        # With COOKIES_DEBUG on, log every Set-Cookie header we received.
        if self.debug:
            cl = [to_native_str(c, errors='replace')
                  for c in response.headers.getlist('Set-Cookie')]
            if cl:
                cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
                msg = "Received cookies from: {}\n{}".format(response, cookies)
                logger.debug(msg, extra={'spider': spider})

    def _format_cookie(self, cookie):
        # build cookie string
        cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
        if cookie.get('path', None):
            cookie_str += '; Path=%s' % cookie['path']
        if cookie.get('domain', None):
            cookie_str += '; Domain=%s' % cookie['domain']
        return cookie_str

    def _get_request_cookies(self, jar, request):
        """Turn request.cookies (dict or list of dicts) into cookie objects."""
        if isinstance(request.cookies, dict):
            cookie_list = [{'name': k, 'value': v} for k, v in \
                           six.iteritems(request.cookies)]
        else:
            cookie_list = request.cookies
        cookies = [self._format_cookie(x) for x in cookie_list]
        # Feed them through a synthetic Response so the jar parses them with
        # the same code path as real Set-Cookie headers.
        headers = {'Set-Cookie': cookies}
        response = Response(request.url, headers=headers)
        return jar.make_cookies(response, request)

默认中间件:

# Default downloader middlewares, keyed by import path. Lower numbers sit
# closer to the engine, higher numbers closer to the downloader: requests
# traverse process_request in ascending order, responses traverse
# process_response in descending order.
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

注意点:

process_request   不用返回,

1. 如果返回 response 对象,就会直接交给各中间件的 process_response 依次处理,不再继续下载

2. 如果返回request , 就到直接根据返回的request 到调度器中执行

process_response:必须有返回值

四  。 爬虫中间件

下载器组件 到 爬虫组件中间件,

默认有 优先级的中间件 和 深度的中间件

编写中间件

class XzxSpiderMiddleware(object):
    """Spider middleware template.

    Not all methods need to be defined; when one is missing, Scrapy acts
    as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses when wiring up the middleware chain.
        return cls()

    def process_spider_input(self, response, spider):
        # Runs for each response entering the spider.
        # Return None to continue processing, or raise to abort.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over whatever the spider yielded for this response.
        # Must yield Request, dict or Item objects.
        for item in result:
            yield item

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or an earlier process_spider_input)
        # raises. May return None or an iterable of Response/dict/Item.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the start requests and with
        # no response attached. Must yield only requests (not items).
        for start_request in start_requests:
            yield start_request

配置文件:

# Register the custom spider middleware; 543 slots it between the built-in
# offsite (500) and referer (700) middlewares.
SPIDER_MIDDLEWARES = {
    'xzx.middlewares.XzxSpiderMiddleware': 543,
}

内置爬虫中间件 settings 中的配置 :

深度 :

DEPTH_LIMIT = 8

优先级

DEPTH_PRIORITY = 1, 请求的优先级:0 -1 -2 -3 。。。。

DEPTH_PRIORITY = -1,请求的优先级:0 1 2 3 。。。。

# Default spider middlewares. Lower numbers sit closer to the engine,
# higher numbers closer to the spider: responses traverse
# process_spider_input in ascending order, spider output traverses
# process_spider_output in descending order.
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}

总结:

1. DupeFilter

- 默认放在set集合

- url变更为唯一标记

- 将去重规则放到redis中的意义何在?

- 去重+dont_filter

2. 调度器

- 爬虫中什么是深度和广度优先?

- 用什么可以实现?

- 栈

- 队列

- 优先级集合

3,开放封闭原则:

对源码封闭,对配置文件开放, 通过修改配置文件,实现自己想要的功能.

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/288550.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

用java调用.net的wcf其实还是很简单的

前些天和我们的一个邮件服务商对接&#xff0c;双方需要进行一些通讯&#xff0c;对方是java团队&#xff0c;而作为.net团队的我们&#xff0c;只能公布出去的是一个wcf的basicbinding&#xff0c;想不 到问题来了&#xff0c;对方不知道怎么去调用这个basic&#xff0c;可能他…

php 使用PDO,防止sql注入 简单说明

PDO&#xff1a;php5 假如以下是一个简单的登录处理&#xff1a; 使用PDO连接mysql首先&#xff1a; 新建数据库 new PDO("mysql:hostlocalhost;dbnametest","root","root"); host:服务器 dbname:数据库名 后面两个分别是帐号和密码 默认…

DbTool 2.0.0 Released

DbTool 2.0.0 ReleasedIntroDbTool 一个支持 DbFirst、ModelFirst 和 CodeFirst 的数据库小工具。DbFirst 是根据数据库中的表信息生成代码中的 Model&#xff0c;以及生成数据表结构文档ModelFirst 是根据数据表信息或者数据表结构文档生成创建数据库的脚本CodeFirst 是指根据…

[蓝桥杯] 蚂蚁感冒

[蓝桥杯] 蚂蚁感冒 峰值内存消耗 < 256M  CPU消耗 < 1000ms 【题目描述 - Problem Description】 长100厘米的细长直杆子上有n只蚂蚁。它们的头有的朝左&#xff0c;有的朝右。 每只蚂蚁都只能沿着杆子向前爬&#xff0c;速度是1厘米/秒。 当两只蚂蚁碰面时&#xff0…

Source Insight之Relation Window Properties配置和一些快捷键

1 Source Insight之Relation Window Properties配置 我们先点击source Insight的这个地方 然后鼠标右键&#xff0c;点击Relation Window Properties&#xff0c;配置如下 2 快捷键 目前就我知道的 1&#xff09;按亮和按熄这个变量 shift F8 2&#xff09;跳转到具体一行…

ArcGIS 10.2 Calculate Value(Data Management) 工具的使用

1、概述 Calculate Value tool returns a value based on a specified Python expression. 计算值工具返回一个基于特定Python表达式的值。 工具位置:ToolBox→Data Management Tools→General→Calculate Value 2、注意事项 (1)该工具只能用于MoudleBuilder,而不能用于Py…

omnicppcomplete php,VIM 常用法 (三)

这里描述常用的插件。1. NERD tree快捷键&#xff1a;o 在已有窗口中打开文件&#xff0c;并跳到该窗口go 在已有窗口中打开目录或文件&#xff0c;但不跳到该窗口i/s 在split/vsplit的新窗口打开文件&#xff0c;并跳到该窗口gi/si 在split/vslpit的新窗口打开文件&#xff0c…

android:versionCode和android:versionName 用途

Android的版本可以在androidmainfest.xml中定义&#xff0c;主要有android:versionCode和android:versionNameandroid:versionCode:主要是用于版本升级所用&#xff0c;是INT类型的&#xff0c;第一个版本定义为1&#xff0c;以后递增&#xff0c;这样只要判断该值就能确定是否…

js 深复制一个对象

自定义 cloneObj 方法 //深复制对象 var cloneObj function (obj) {var newObj {};if (obj instanceof Array) {newObj [];}for (var key in obj) {var val obj[key];newObj[key] typeof val object ? cloneObj(val) : val;}return newObj; }; 转载于:https://www.cnblo…

js (javascript) 中获取年月日信息

获取年月日的所有信息&#xff1a; 直接使用date(); 如要取得 其他单独年月日时间 &#xff1a; 首先创建一个date对象&#xff0c;例如 &#xff1a; var d new Date(); 随后例如取得当前年 &#xff0c;使用&#xff1a; d.getFullYear() 当前月使用 &#xff1a; d…

vb6编写dll读取dat文件_【STM32Cube_15】使用硬件I2C读取温湿度传感器数据(SHT30)...

寻求更好的阅读体验&#xff0c;请移步Mculover666的个人博客&#xff1a;【STM32Cube_15】使用硬件I2C读取温湿度传感器数据&#xff08;SHT30&#xff09;​www.mculover666.cn本篇详细的记录了如何使用STM32CubeMX配置STM32L431RCT6的硬件I2C外设&#xff0c;读取SHT30温湿度…

windows之如何刷新电脑DNS缓存

1 问题 我在windows电脑里面的终端通过ping 泛域名&#xff0c;可以看到这个域名解析的地址&#xff0c;后面做了这个泛域名解析地址做了修改&#xff0c;但是我ping 这个泛域名地址的时候依然是之前的地址&#xff0c;然后我把终端关闭掉&#xff0c;再次ping这个地址&#x…

tcp/ip ---数据封装过程

转载于:https://www.cnblogs.com/saryli/p/5306721.html

NuGet 新特性 -- 中心化的 NuGet 包版本管理

NuGet 新特性 -- 中心化的 NuGet 包版本管理IntroNuGet 支持了一个可以中心化管理 NuGet 包版本的方案&#xff0c;我们可以在一个地方统一管理 NuGet 包的版本Preface在之前的版本中我们通常在每个指定包版本引用的地方会设置 NuGet 包的版本号&#xff0c;如果项目比较多&…

Java面向对象编程学习

1、新建一个工程&#xff0c;在工程下新建一个类Method01&#xff08;勾选创建main函数&#xff09; package ClassStudy;class Person {String name; //默认为nullint age; //默认为0//构造函数public Person(){namenull;age0;}//方法public void say(){System.out.println(&…

数据库oracle 别名不能更新,数据库oracle改成mysql后Hibernate不能使用别名问题

报错&#xff1a; [INFO ] 11:34:19.272 [http-apr-7081-exec-8] org.hibernate.type.StringType - could not read column value from result set: PK_ID; Column PK_ID not found. [WARN ] 11:34:19.274 [http-apr-7081-exec-8] o.h.util.JDBCExceptionReporter - SQL Error:…

sql 两表更新

UPDATE sale_origin_line set statecancel from sale_origin p,sale_origin_line q where p.idq.order_id and p.company_id3 and p.date<2017-08-01;转载于:https://www.cnblogs.com/1314520xh/p/7338592.html

查询工资最低的3名员工的职工工号、姓名和收入_普法课堂|你有多久没有收到工资条了?...

工资条对于现在的大多数劳动者而言&#xff0c;尤其是90、00后&#xff0c;可能是一个十分陌生的概念和事物&#xff0c;许多用人单位已经不再向劳动者发放&#xff0c;在司法实践中工资条越来越少的在庭审中予以呈现。工资条对于劳动者而言十分重要&#xff0c;即便未有用人单…

js(javascript)取得当前时间小时,分钟,秒 以及毫秒

首先 我们需要new一个date对象&#xff1a; var d new Date(); 随后&#xff0c;取得当前时间小时&#xff1a; d.getHours() 取得当前分钟&#xff1a; d.getMinutes()) 取得当前秒&#xff1a; d.getSeconds() 取得当前毫秒&#xff1a; d.getMilliseconds() 全部…

linux之通过tail命令动态跟踪日志文件里面的末尾信息

1 问题场景 比如在linux系统,我们服务端的错误日志在一个文件里面不断输入进去,我们需要动态查看,我们总不可能每次进行cat文件查看 2 tail命令和head命令 head命令默认是输出一个文件的最前面10行 tail命令默认是输出一个文件的最后面10行 1) -n参数 具体显示多少行 显示…