本文使用 Scrapy 对某个网站的静态数据进行了抓取。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import request
import requests
import os
import sys
# Python 2 only: make UTF-8 the implicit str/unicode codec so that Chinese
# text does not raise encoding errors. Python 3 has neither a builtin
# reload() nor sys.setdefaultencoding() and already defaults to UTF-8, so
# the hack is skipped there instead of crashing with NameError.
if sys.version_info[0] < 3:
    # site.py removes setdefaultencoding at startup; reloading sys restores it.
    reload(sys)
    sys.setdefaultencoding('utf-8')
class spider(scrapy.Spider):
    """Crawl a paginated picture listing and save each gallery's images.

    The crawl has two levels: ``parse`` handles the listing pages in
    ``start_urls`` and schedules one request per gallery link, and
    ``parse_page`` handles a gallery page, downloading its images into a
    directory named after the page's ``<h1>`` title under ``save_dir``.
    """

    name = 'picSpider'
    allowed_domains = []

    # Root directory that receives one sub-directory per gallery. Override in
    # a subclass to store the downloads somewhere else.
    save_dir = 'd:/dd'
    # Seconds to wait for each image download made with ``requests``.
    image_timeout = 10
    # Characters replaced with "_" when a page title becomes a directory name.
    _INVALID_DIR_CHARS = '\\/:*?"<>|'

    # Page 1 lives at the section root; pages 2-399 use the p_<n>.html form.
    urls = ['http://www.***.com/pic/12/'] + [
        'http://www.***.com/pic/12/p_%d.html' % page for page in range(2, 400)
    ]
    start_urls = urls

    def parse(self, response):
        """Yield one request per gallery linked from a listing page.

        Args:
            response: Scrapy response for one of the ``start_urls`` pages.

        Yields:
            scrapy.Request: request for a gallery page, handled by
            ``parse_page``.
        """
        hrefs = response.xpath(
            "//div[@class='box list channel']/ul/li/a/@href").extract()
        for href in hrefs:
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute ones intact, unlike plain string concatenation.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_page)

    def parse_page(self, response):
        """Download every image of a gallery page into its own directory.

        The directory is ``save_dir`` joined with the sanitised text of the
        first ``<h1>``. Pages without a usable title are skipped with a
        warning, and an image that fails to download is logged and skipped
        so the remaining images are still fetched.

        Args:
            response: Scrapy response for a gallery page.

        Raises:
            OSError: if the target directory cannot be created.
        """
        titles = response.xpath("//h1/text()").extract()
        folder = self._safe_dirname(titles[0]) if titles else ''
        if not folder:
            self.logger.warning('No usable <h1> title on %s; skipping page',
                                response.url)
            return

        path = os.path.join(self.save_dir, folder)
        try:
            # makedirs also creates save_dir itself when it is missing.
            os.makedirs(path)
        except OSError:
            # An already existing directory is fine; anything else is real.
            if not os.path.isdir(path):
                raise

        for src in response.xpath("//div[@class='post']/img/@src").extract():
            filename = src.split('/')[-1]
            if not filename:
                # A src ending in "/" gives no file name to save under.
                self.logger.warning('Cannot derive a file name from %s', src)
                continue
            try:
                pic = requests.get(response.urljoin(src),
                                   timeout=self.image_timeout)
                # Do not store an HTTP error page as if it were an image.
                pic.raise_for_status()
            except requests.RequestException as exc:
                self.logger.warning('Failed to download %s: %s', src, exc)
                continue
            with open(os.path.join(path, filename), 'wb') as image_file:
                image_file.write(pic.content)

    @classmethod
    def _safe_dirname(cls, title):
        """Return ``title`` stripped and with path-reserved characters replaced."""
        return ''.join(
            '_' if ch in cls._INVALID_DIR_CHARS else ch
            for ch in title.strip()
        )