This post uses an automated crawler framework to scrape documentation from the WeChat Open Community (the focus is on understanding how to build an automated crawler with the Scrapy framework, and on the basic usage of the LinkExtractor and Rule classes).
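Before walking through the project, here is a quick standalone sketch of what LinkExtractor does: given a response, it returns the links in the page whose URLs match the `allow` pattern. The HTML snippet below is invented purely for illustration:

```python
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Made-up page with one matching and one non-matching link
body = b'''<html><body>
  <a href="/community/develop/doc/000abc123">a doc article</a>
  <a href="/community/develop/question/456">not a doc article</a>
</body></html>'''

response = HtmlResponse(
    url='https://developers.weixin.qq.com/community/develop/doc/0008c44fa43768b7f22a3b3e151c00',
    body=body,
    encoding='utf-8',
)

extractor = LinkExtractor(allow=r'/community/develop/doc/\w+')
for link in extractor.extract_links(response):
    print(link.url)  # prints only the absolute URL of the /doc/ link
```

A Rule simply wraps such an extractor and tells the CrawlSpider which callback to run on each extracted link.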
The package structure is shown below:
![](https://i-blog.csdnimg.cn/direct/bb08145af9d14d0bbd1bd76557ee0fbf.png)
Main code:
(items.py)

```python
import scrapy


class ArticleItem(scrapy.Item):
    # One item per scraped documentation article
    title = scrapy.Field()    # page title
    content = scrapy.Field()  # article body (raw HTML here)
    author = scrapy.Field()   # author name, if the page exposes one
```
(weixin_spider.py)

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from weixin_spider.items import ArticleItem


class WeixinSpider(CrawlSpider):
    name = 'weixin_spider'
    allowed_domains = ['weixin.qq.com']  # subdomains such as developers.weixin.qq.com are allowed too
    start_urls = [
        'https://developers.weixin.qq.com/community/develop/doc/0008c44fa43768b7f22a3b3e151c00',
    ]

    # Extract every link matching the doc-article URL pattern and parse it;
    # follow=True keeps discovering new article links from each parsed page
    # (with a callback set, Rule defaults to follow=False).
    rules = (
        Rule(LinkExtractor(allow=r'/community/develop/doc/\w+'),
             callback='parse_article', follow=True),
    )

    def parse_article(self, response):
        item = ArticleItem()
        item['title'] = response.xpath('//title/text()').get()
        # Example only: grab the whole <body>; narrow the XPath to the real content node in practice
        item['content'] = response.xpath('//body').get()
        # Assumes the page has author information; adjust the selector to the actual markup
        item['author'] = response.xpath('//author/text()').get()
        yield item
```
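The post does not show a pipeline, but to persist the scraped items you would typically add one. Below is a minimal sketch (the file name articles.jl and the class name JsonWriterPipeline are my own choices, not from the post) that writes each item as one JSON line:

```python
# pipelines.py (hypothetical; not shown in the original post)
import json


class JsonWriterPipeline:
    def open_spider(self, spider):
        # One JSON object per line ("JSON Lines") is easy to append to and stream
        self.file = open('articles.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
```

To enable it, register it in settings.py via `ITEM_PIPELINES = {'weixin_spider.pipelines.JsonWriterPipeline': 300}` (assuming the project is named weixin_spider). Alternatively, skip the pipeline and run `scrapy crawl weixin_spider -o articles.json` to use Scrapy's built-in feed export.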
The scraped data is shown below: