数据入库
先创建一个数据库
create table book(id int primary key auto_increment,name varchar(128),src varchar(128));
settings.py
DB_HOST = '169.254.38.183'
# 端口号是一个整数
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '123456'
# 数据库名称
DB_NAME = 'spider01'
DB_CHARSET = 'utf8'# 管道
ITEM_PIPELINES = {"scarpy_readbook_41.pipelines.ScarpyReadbook41Pipeline": 300,'scarpy_readbook_41.pipelines.MysqlPipeline': 301
}
pipelines.py
# 加载settings文件
from scrapy.utils.project import get_project_settings
import pymysqlclass MysqlPipeline:def open_spider(self, spider):settings = get_project_settings()self.host = settings['DB_HOST']self.port = settings['DB_PORT']self.user = settings['DB_USER']self.password = settings['DB_PASSWORD']self.name = settings['DB_NAME']self.charset = settings['DB_CHARSET']self.coonect()def coonect(self):self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password, db=self.name, charset=self.charset)self.cursor = self.conn.cursor()def process_item(self, item, spider):sql = 'insert into book(name,src) values("{}","{}")'.format(item['name'], item['src'])# 执行sql语句self.cursor.execute(sql)# 提交self.conn.commit()return itemdef close_spider(self, spider):self.cursor.close()self.conn.close()
链接跟进:
在read.py里follow改成=True
rules = (Rule(LinkExtractor(allow=r"/book/1206_\d+\.html"), callback="parse_item", follow=True),)