1. Create the Scrapy project
Use the global command startproject to create the project: make a new folder, change into it from the command line, and create a Scrapy project named jingdong.
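For example, from a shell (the working-folder name here is just an illustration, not part of the original write-up):

mkdir jd_work        # any new folder
cd jd_work
scrapy startproject jingdong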
2. Use the project command genspider to create the Spider
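Run it from inside the project directory. genspider only generates a skeleton; the file is then filled in as shown in step 3 (spider name jd and domain www.jd.com match the code below):

cd jingdong
scrapy genspider jd www.jd.com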
3. Send requests, receive responses, and extract the data
# -*- coding: utf-8 -*-
import scrapy
from jingdong.items import JingdongItem


class JdSpider(scrapy.Spider):
    name = "jd"
    allowed_domains = ["www.jd.com"]
    start_urls = ['http://www.jd.com/']
    # First half of a search results page (the 30 items rendered directly).
    search_url1 = 'https://search.jd.com/Search?keyword={key}&enc=utf-8&page={page}'
    # Second half of the page, loaded lazily through s_new.php.
    # search_url2 = 'https://search.jd.com/s_new.php?keyword={key}&enc=utf-8&page={page}&scrolling=y&pos=30&show_items={goods_items}'
    search_url2 = 'https://search.jd.com/s_new.php?keyword={key}&enc=utf-8&page={page}&s=26&scrolling=y&pos=30&tpl=3_L&show_items={goods_items}'
    shop_url = 'http://mall.jd.com/index-{shop_id}.html'

    def start_requests(self):
        key = '长裤'  # search keyword ("trousers")
        for num in range(1, 100):
            # Build the page numbers: JD serves each visible page as two requests,
            # odd page numbers for the first half, even ones for the lazy-loaded half.
            page1 = str(2 * num - 1)
            page2 = str(2 * num)
            yield scrapy.Request(url=self.search_url1.format(key=key, page=page1),
                                 callback=self.parse, dont_filter=True)
            # Request the same page again so get_next_half can collect the data-pid
            # values needed to build the s_new.php URL for the second half.
            yield scrapy.Request(url=self.search_url1.format(key=key, page=page1),
                                 callback=self.get_next_half,
                                 meta={'page2': page2, 'key': key}, dont_filter=True)

    def get_next_half(self, response):
        try:
            items = response.xpath('//*[@id="J_goodsList"]/ul/li/@data-pid').extract()
            key = response.meta['key']
            page2 = response.meta['page2']
            goods_items = ','.join(items)
            # dont_filter is required here; without it Scrapy drops the request,
            # reportedly because s_new.php clashes with allowed_domains.
            yield scrapy.Request(url=self.search_url2.format(key=key, page=page2, goods_items=goods_items),
                                 callback=self.next_parse, dont_filter=True)
        except Exception as e:
            print('no data')

    def parse(self, response):
        all_goods = response.xpath('//div[@id="J_goodsList"]/ul/li')
        for one_good in all_goods:
            item = JingdongItem()
            try:
                data = one_good.xpath('div/div/a/em')
                item['title'] = data.xpath('string(.)').extract()[0]  # all text inside the tag
                item['comment_count'] = one_good.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # number of comments
                item['goods_url'] = 'http:' + one_good.xpath('div/div[4]/a/@href').extract()[0]  # product link
                item['shops_id'] = one_good.xpath('div/div[@class="p-shop"]/@data-shopid').extract()[0]  # shop ID
                item['shop_url'] = self.shop_url.format(shop_id=item['shops_id'])
                goods_id = one_good.xpath('div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
                if price:
                    # Some items have zero comments and no price in the page source;
                    # they seem to be temporary front-page promotions (three or four
                    # per page), so items without a price are simply skipped.
                    item['price'] = price[0]
                # print(item)
                yield item
            except Exception as e:
                pass

    def next_parse(self, response):
        # The lazily loaded half comes back as bare <li> fragments,
        # hence the different XPath root compared with parse().
        all_goods = response.xpath('/html/body/li')
        for one_good in all_goods:
            item = JingdongItem()
            try:
                data = one_good.xpath('div/div/a/em')
                item['title'] = data.xpath('string(.)').extract()[0]  # all text inside the tag
                item['comment_count'] = one_good.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # number of comments
                item['goods_url'] = 'http:' + one_good.xpath('div/div[4]/a/@href').extract()[0]  # product link
                item['shops_id'] = one_good.xpath('div/div[@class="p-shop"]/@data-shopid').extract()[0]  # shop ID
                item['shop_url'] = self.shop_url.format(shop_id=item['shops_id'])
                goods_id = one_good.xpath('div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
                if price:  # skip promoted items with no price in the source (see parse)
                    item['price'] = price[0]
                yield item
                # print(item)
            except Exception as e:
                pass
                # print(e, 'no data')
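The spider imports JingdongItem from jingdong.items, which is not shown in the original post. A minimal sketch, inferred from the fields the spider fills in, would be:

# -*- coding: utf-8 -*-
# jingdong/items.py -- minimal sketch matching the fields used by the spider
import scrapy


class JingdongItem(scrapy.Item):
    title = scrapy.Field()          # product title
    comment_count = scrapy.Field()  # number of comments
    goods_url = scrapy.Field()      # product link
    shops_id = scrapy.Field()       # shop ID
    shop_url = scrapy.Field()       # shop link
    goods_id = scrapy.Field()       # SKU ID
    price = scrapy.Field()          # price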
4. Set up the pipeline to save the data; create the MySQL database and table (a table-creation sketch follows the pipeline code):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from pymongo import MongoClient  # only needed for the commented-out MongoDB version below


class JingdongPipeline(object):
    # MongoDB version, kept for reference:
    # def __init__(self):
    #     self.client = MongoClient()
    #     self.database = self.client['jingdong']
    #     self.db = self.database['jingdong_infomation']
    #
    # def process_item(self, item, spider):
    #     # Use goods_id as the key: update the record if it exists, insert otherwise.
    #     self.db.update({'goods_id': item['goods_id']}, dict(item), True)
    #     return item
    #
    # def close_spider(self, spider):
    #     self.client.close()

    def __init__(self):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    passwd='mysql', db='jingdong', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Some titles are duplicated and some items lack a price, so wrap
        # everything in try/except and silently drop the offending items.
        try:
            title = item['title']
            comment_count = item['comment_count']  # number of comments
            shop_url = item['shop_url']            # shop link
            price = item['price']
            goods_url = item['goods_url']
            shops_id = item['shops_id']
            goods_id = int(item['goods_id'])
            try:
                self.cursor.execute(
                    "insert into jingdong_goods(title,comment_count,shop_url,price,goods_url,shops_id,goods_id) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (title, comment_count, shop_url, price, goods_url, shops_id, goods_id))
                self.conn.commit()
            except Exception as e:
                pass
        except Exception as e:
            pass
        return item

    # def close_spider(self, spider):
    #     self.conn.close()
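The pipeline assumes a jingdong database containing a jingdong_goods table. The original does not give the column definitions, so the one-off helper below is only a plausible sketch (the column types are assumptions); run it once before crawling:

# create_table.py -- one-off helper; column types are assumptions
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='mysql', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jingdong DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jingdong.jingdong_goods (
        title         VARCHAR(255),
        comment_count VARCHAR(32),
        shop_url      VARCHAR(255),
        price         VARCHAR(32),
        goods_url     VARCHAR(255),
        shops_id      VARCHAR(32),
        goods_id      BIGINT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()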
5. Configure the project settings (ROBOTSTXT_OBEY and COOKIES_ENABLED are turned off, and the pipeline from step 4 is enabled):
# -*- coding: utf-8 -*-

# Scrapy settings for jingdong project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jingdong'

SPIDER_MODULES = ['jingdong.spiders']
NEWSPIDER_MODULE = 'jingdong.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jingdong.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'jingdong.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
6. Run the crawl: execute the project command crawl to start the Spider:
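From the project directory, using the spider name jd defined above; scraped items are written into the jingdong_goods table by the pipeline:

scrapy crawl jd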