spiders.py
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.crawler import CrawlerProcess

from weather.items import WeatherItem

'''
Multi-level category crawl
'''


class IgxSpider(scrapy.Spider):
    name = 'igx_result'
    allowed_domains = ['www.igxpt.com']
    # start_urls = ['http://www.igxpt.com/cate/192/']

    def start_requests(self):
        # The category range is hard-coded here for brevity; you could instead
        # start from the home page and scrape these category URLs, which adds
        # one more level of classification.
        start_urls = ['http://www.igxpt.com/cate/{}/'.format(i) for i in range(192, 194)]
        for url in start_urls:
            yield scrapy.Request(url=url)  # callback defaults to self.parse

    def parse(self, response):
        # --- get the total number of result pages: start ---
        page = response.xpath(
            '//div[@class="dataTables_paginate paging_simple_numbers"]/span/text()'
        ).extract_first()
        # The paginator reads e.g. "共12页" ("12 pages in total"); capture the digits.
        ret = re.search(r'共(\d+)页', page)
        number = ret.group(1)
        page_link = response.xpath('//ul[@class="pagination"]/li/a/@href').extract_first()
        current_url = 'http://www.igxpt.com' + page_link.split('=')[0] + '='
        # --- get the total number of result pages: end ---

        clearfix = response.xpath('//ul[@class="shop-list-recommend mt20 clearfix"]/li')
        for li in clearfix:
            item = WeatherItem()
            item['name'] = li.xpath('./a/p[1]/text()').extract_first()
            url_img = li.xpath('./a/div/img/@src').extract_first()
            item['url'] = "http://www.igxpt.com" + url_img
            price_alia = li.xpath('./a/p[2]/span[@class="blue"]/text()').extract_first()
            item['price'] = price_alia + "元"  # price in RMB yuan, e.g. "12.5元"
            yield item

        # Build the paginated URLs and feed them back into parse; Scrapy's
        # duplicate filter drops pages that have already been requested.
        urls = [current_url + str(i) for i in range(1, int(number) + 1)]
        for se in urls:
            yield scrapy.Request(url=se, callback=self.parse)
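
The CrawlerProcess import above is never used, which suggests the spider was also meant to run as a plain script. A minimal runner sketch, assuming it is appended to this same file inside the project directory so get_project_settings() can locate settings.py:

if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings

    # Run with `python spiders.py` as an alternative to `scrapy crawl igx_result`.
    process = CrawlerProcess(get_project_settings())
    process.crawl(IgxSpider)
    process.start()  # blocks until the crawl finishes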
items.py
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import urllib.request  # only needed for the optional image download below


class WeatherPipeline(object):
    def process_item(self, item, spider):
        name = item['name']
        url = item['url']
        price = item['price']
        connection = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='root',
            db='scrapy',
            # charset='utf8mb4',  # note: pymysql expects 'utf8mb4', not 'utf-8'
            cursorclass=pymysql.cursors.DictCursor
        )
        try:
            # '''download the image'''
            # imgname = url.split('/')[-1]
            # path = r"D:\Python\weather\weather\images\%s" % (imgname)
            # urllib.request.urlretrieve(url, filename=path)

            # insert the item into the database
            with connection.cursor() as cursor:
                sql = """INSERT INTO `goods_info_detail` (name, url, price) VALUES (%s, %s, %s)"""
                cursor.execute(sql, (name, url, price))
            connection.commit()
        except pymysql.MySQLError as e:  # ValueError would never catch database errors
            print(e)
        finally:
            connection.close()
        return item
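
Opening a fresh MySQL connection for every scraped item is expensive. A common alternative is to hold one connection for the whole crawl via the pipeline's open_spider/close_spider hooks; a minimal sketch reusing the credentials above (the class name MySQLPipeline and the column types in the schema comment are assumptions, not taken from the original project):

import pymysql


class MySQLPipeline(object):
    # Assumed schema for the target table (types are a guess):
    #   CREATE TABLE goods_info_detail (
    #       id    INT AUTO_INCREMENT PRIMARY KEY,
    #       name  VARCHAR(255),
    #       url   VARCHAR(512),
    #       price VARCHAR(32)
    #   ) DEFAULT CHARSET = utf8mb4;

    def open_spider(self, spider):
        # one connection for the whole crawl instead of one per item
        self.connection = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='root',
            db='scrapy',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        sql = """INSERT INTO `goods_info_detail` (name, url, price) VALUES (%s, %s, %s)"""
        with self.connection.cursor() as cursor:
            cursor.execute(sql, (item['name'], item['url'], item['price']))
        self.connection.commit()
        return item

If you swap this in, remember to point ITEM_PIPELINES in settings.py at the new class name.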
settings.py
LOG_LEVEL = 'WARNING'
BOT_NAME = 'weather'
SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'
'''
Pipelines
'''
ITEM_PIPELINES = {
'weather.pipelines.WeatherPipeline': 300,
}
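
With the pipeline registered, the crawl is started from the project root by spider name:

scrapy crawl igx_result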