Main program (the spider):
# -*- coding: utf-8 -*-
import re

import scrapy
from urllib.parse import urljoin

from nmgepb.items import NmgepbItem


class BasicNmgepbSpider(scrapy.Spider):
    name = 'basic_nmgepb'
    allowed_domains = ['nmgepb.gov.cn']
    start_urls = ['http://nmgepb.gov.cn/']

    def __init__(self):
        super().__init__()
        self.countNum = 1
        self.startLink = "http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index.html"

    def start_requests(self):
        yield scrapy.Request(url=self.startLink, dont_filter=True, callback=self.link_parse)

    def customXpathParse(self, value):
        # Join an extracted node list into one stripped string
        return ''.join(value).strip()

    def customReParse(self, condition, index=1):
        # Return the captured group of a regex match, or '' when there is no match
        if condition:
            return condition.group(index).strip()
        else:
            return ''

    def link_parse(self, response):
        if len(response.text) < 1000:
            # Page came back truncated; request it again
            yield scrapy.Request(url=response.url, dont_filter=True, callback=self.link_parse)
        else:
            allLinks = response.xpath("/html/body/div[3]/div/div[3]/div[2]/ul/li/span[2]/a/@href").extract()
            for link in allLinks:
                link = urljoin(response.url, link)
                yield scrapy.Request(url=link, callback=self.info_parse)
            if response.url == self.startLink:
                # Follow the paginated listing pages index_1.html ... index_5.html
                for pageNum in range(1, 6):
                    link = '{0}_{1}.html'.format(self.startLink.split('.html')[0], pageNum)
                    yield scrapy.Request(url=link, callback=self.link_parse)

    def info_parse(self, response):
        # A few malformed detail pages are skipped outright
        skipUrls = (
            'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472451.html',
            'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472450.html',
            'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472443.html',
        )
        if response.url in skipUrls:
            return

        item = NmgepbItem()
        trData = response.xpath('//table//tr')
        tableClass = self.customXpathParse(response.xpath('//table/@class').extract())

        if trData:
            # Detail pages use tables with different column layouts; map each layout to the item fields
            for data in trData:
                tdNum = len(data.xpath('./td'))
                firstTd = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                lastTd = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                if tdNum == 3:
                    if tableClass == 'MsoTableGrid':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = ''
                        if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                    elif tableClass == 'FCK__ShowTableBorders':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['place'] = ''
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                elif (tdNum == 6) and (lastTd):
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                    item['date'] = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                    if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
                elif (tdNum == 5 or tdNum == 6) and (not lastTd):
                    if firstTd.isdigit():
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                        if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                    else:
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['mechanism'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                        if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                elif tdNum == 7:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[7]//text()').extract())
                    if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
                elif tdNum == 9:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[9]//text()').extract())
                    if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
        else:
            # No table on the page: fall back to regex extraction from the raw HTML
            item['link'] = response.url
            item['title'] = self.customReParse(re.search(r'<strong>项目名称:</strong>(.*?)<', response.text, re.I))
            item['place'] = self.customReParse(re.search(r'<strong>建设地点:</strong>(.*?)<', response.text, re.I))
            item['company'] = self.customReParse(re.search(r'<strong>建设单位:</strong>(.*?)<', response.text, re.I))
            item['mechanism'] = self.customReParse(re.search(r'<strong>环境影响评价机构:</strong>(.*?)<', response.text, re.I))
            item['date'] = self.customReParse(re.search(r'<strong>受理日期:</strong>(.*?)<', response.text, re.I))
            if (item['title']) and (item['title'] != '编号') and (item['title'] != '项目名称'):
                print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                self.countNum += 1
                yield item
items:
import scrapy


class NmgepbItem(scrapy.Item):
    link = scrapy.Field()
    title = scrapy.Field()
    place = scrapy.Field()
    company = scrapy.Field()
    mechanism = scrapy.Field()
    date = scrapy.Field()
middlewares:
from scrapy import signals


class NmgepbSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class NmgepbDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines:
import os
import csv


class NmgepbPipeline(object):
    def __init__(self):
        # Write the CSV next to the project package, one level above this file
        self.csvFilePath = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'nmgepb.csv')
        self.csvFile = open(self.csvFilePath, 'w', encoding='gb18030', newline='')
        self.csvWrite = csv.writer(self.csvFile)
        self.csvWrite.writerow(['页面链接', '项目名称', '建设地点', '建设单位', '评价机构', '受理日期'])

    def process_item(self, item, spider):
        self.csvWrite.writerow([item.get('link'), item.get('title'), item.get('place'), item.get('company'), item.get('mechanism'), item.get('date')])
        return item

    def close_spider(self, spider):
        self.csvFile.close()
        print("恭喜, 数据采集完成, 存储路径:%s" % self.csvFilePath)
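To spot-check the export, the file can be read back with the same encoding the pipeline uses. A minimal sketch, assuming it is run from the directory where nmgepb.csv was written (the project folder, given the path built in __init__ above):

import csv

# Read the exported file back with the encoding used by NmgepbPipeline
with open('nmgepb.csv', encoding='gb18030', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)          # ['页面链接', '项目名称', ...]
    rows = list(reader)

print(header)
print('rows exported:', len(rows))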
settings (add the following):
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
}

ITEM_PIPELINES = {
    'nmgepb.pipelines.NmgepbPipeline': 300,
}
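With the request headers and the pipeline registered, the crawl can be started with "scrapy crawl basic_nmgepb" from the project root. A small runner script works as well; a sketch, assuming it sits next to scrapy.cfg so the project settings are picked up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (DEFAULT_REQUEST_HEADERS, ITEM_PIPELINES, ...)
process = CrawlerProcess(get_project_settings())
process.crawl('basic_nmgepb')   # spider name defined on BasicNmgepbSpider
process.start()                 # blocks until the crawl finishes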