python爬虫之feapder.AirSpider轻量爬虫案例:豆瓣
1. 创建feapderSpider项目:feapder create -p feapderSpider,已创建可忽略
2. 进入feapderSpider目录:cd .\feapderSpider\spiders
3. 创建爬虫:feapder create -s airSpiderDouban,选择AirSpider爬虫模板,可跳过1、2直接创建爬虫文件
4. 配置邮件报警:报警配置163邮箱,https://feapder.com/#/source_code/%E6%8A%A5%E8%AD%A6%E5%8F%8A%E7%9B%91%E6%8E%A7
5. 在setting.py中打开mysql配置;若无setting.py文件,可运行命令feapder create --setting生成
# # MYSQL
MYSQL_IP = "localhost"
MYSQL_PORT = 3306
MYSQL_DB = "video"
MYSQL_USER_NAME = "root"
MYSQL_USER_PASS = "root"

# # REDIS
# # ip:port; several nodes may be written as a list or comma-separated,
# # e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
REDISDB_IP_PORTS = "localhost:6379"
REDISDB_USER_PASS = ""
REDISDB_DB = 0
# Extra keyword arguments passed when connecting to redis, e.g. ssl=True
REDISDB_KWARGS = dict()
# Used for redis sentinel mode
REDISDB_SERVICE_NAME = ""

# # Deduplication
ITEM_FILTER_ENABLE = True  # item de-duplication
ITEM_FILTER_SETTING = dict(
    # 1 = permanent (BloomFilter), 2 = in-memory (MemoryFilter),
    # 3 = temporary (ExpireFilter), 4 = lightweight (LiteFilter)
    filter_type=1,
    # This keyword had been swallowed by the trailing comment when the
    # line breaks were lost; it must be a real argument to take effect.
    name="douban",
)

# # Email alerts
EMAIL_SENDER = "123123123@163.com"  # sender
# Authorization code. NOTE(review): keep real secrets out of version control.
EMAIL_PASSWORD = "EYNXMBWJKMLZFTKQ"
EMAIL_RECEIVER = ["123123123@163.com"]  # recipients; a list, several may be given
EMAIL_SMTPSERVER = "smtp.163.com"  # mail server, 163 mailbox by default
创建item:feapder create -i douban,选择item。需要先在数据库中创建表:
CREATE TABLE IF NOT EXISTS douban(id INT AUTO_INCREMENT,title VARCHAR(255),rating FLOAT,quote VARCHAR(255),intro TEXT,PRIMARY KEY(id))
修改douban_item.py文件:
# -*- coding: utf-8 -*-
"""
Created on 2023-10-08 16:17:51
---------
@summary:
---------
@author: Administrator
"""

from feapder import Item


class DoubanItem(Item):
    """
    This class was generated by feapder
    command: feapder create -i douban

    One scraped movie record, stored in the ``douban`` table.
    """

    __table_name__ = "douban"
    # Fields that identify a record for de-duplication: title, quote and
    # rating. Each field is listed once; the original repeated "title".
    # NOTE(review): presumably feapder derives the item fingerprint from
    # the combined values of these fields - confirm against the feapder docs.
    __unique_key__ = ["title", "quote", "rating"]

    def __init__(self, *args, **kwargs):
        """Create an item with every column initialised to ``None``.

        Positional arguments are accepted but not forwarded to the base
        class; only keyword arguments are passed on.
        """
        super().__init__(**kwargs)
        # "id" is deliberately not set here.
        # NOTE(review): presumably the database assigns it - confirm against the table schema.
        self.intro = None
        self.quote = None
        self.rating = None
        self.title = None
爬虫文件:air_spider_douban.py
# -*- coding: utf-8 -*-
"""
Created on 2023-10-06 15:36:09
---------
@summary:
---------
@author: Administrator
"""

import feapder
from items.douban_item import DoubanItem
from feapder.network.user_agent import get as get_ua
from requests.exceptions import ConnectTimeout, ProxyError
from feapder.utils.email_sender import EmailSender
import feapder.setting as setting


class AirSpiderDouban(feapder.AirSpider):
    """Crawl the Douban movie Top 250 list and each movie's detail page."""

    def __init__(self, thread_count=None):
        """Initialise the spider and remember the list-page entry URL.

        Args:
            thread_count: forwarded unchanged to ``feapder.AirSpider``.
        """
        super().__init__(thread_count)
        self.request_url = 'https://movie.douban.com/top250'

    def start_requests(self):
        """Yield the single seed request for the Top 250 list page."""
        yield feapder.Request(self.request_url)

    def download_midware(self, request):
        """Replace the request headers with a single User-Agent header.

        The User-Agent value comes from ``get_ua()`` on every call.
        """
        request.headers = {'User-Agent': get_ua()}
        return request

    def parse(self, request, response):
        """Parse one list page.

        For every movie entry a ``DoubanItem`` is filled with title, rating
        and quote, then handed to ``get_detail_info`` through a request to
        the movie's detail page. Entries without a detail link produce
        nothing. Finally the next list page, if any, is requested.
        """
        video_list = response.xpath('//ol[@class="grid_view"]/li')
        for li in video_list:
            item = DoubanItem()
            item['title'] = li.xpath('.//div[@class="hd"]/a/span[1]/text()').extract_first()
            item['rating'] = li.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()').extract_first()
            item['quote'] = li.xpath('.//div[@class="bd"]//p[@class="quote"]/span/text()').extract_first()
            detail_url = li.xpath('.//div[@class="hd"]/a/@href').extract_first()
            if detail_url:
                # The partially filled item travels with the request so the
                # detail callback can complete it.
                yield feapder.Request(detail_url, callback=self.get_detail_info, item=item)

        # Follow the link to the next list page, when there is one.
        next_page_url = response.xpath('//div[@class="paginator"]//link[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield feapder.Request(next_page_url, callback=self.parse)

    def get_detail_info(self, request, response):
        """Add the movie introduction to the item carried by the request.

        The first XPath is tried first; when it yields nothing the second
        one is used. The stripped text (possibly empty) is stored in
        ``intro`` and the completed item is yielded.
        """
        item = request.item
        detail = response.xpath('//span[@class="all hidden"]/text()').extract_first() or ''
        if not detail:
            detail = response.xpath('//div[@id="link-report-intra"]/span[1]/text()').extract_first() or ''
        item['intro'] = detail.strip()
        yield item

    def exception_request(self, request, response, e):
        """Delete the request's proxy after a connect-timeout or proxy error.

        Any other exception type is ignored here.
        """
        # isinstance instead of an exact type() match, so subclasses of the
        # two exception types are recognised as well.
        if isinstance(e, (ConnectTimeout, ProxyError)):
            request.del_proxy()

    def end_callback(self):
        """Send a notification e-mail using the EMAIL_* values from settings."""
        with EmailSender(setting.EMAIL_SENDER, setting.EMAIL_PASSWORD) as email_sender:
            email_sender.send(setting.EMAIL_RECEIVER, title='python', content="爬虫结束")


if __name__ == "__main__":
    AirSpiderDouban(thread_count=5).start()
根据feapder create -p feapderSpider命令创建的项目下会有main文件。除了单独运行爬虫文件,也可以在main文件中运行,一般用于运行多个爬虫:
from feapder import ArgumentParser
from spiders import *


def crawl_air_spider_douban():
    """Create the Douban AirSpider with default arguments and start it."""
    air_spider_douban.AirSpiderDouban().start()


if __name__ == "__main__":
    # Register the crawl function as a command-line switch, then run it
    # by name.
    parser = ArgumentParser(description="爬虫练习")
    parser.add_argument(
        "--crawl_air_spider_douban",
        action="store_true",
        help="豆瓣AirSpide",
        function=crawl_air_spider_douban,
    )
    parser.run("crawl_air_spider_douban")
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/122386.shtml
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!