Scrapy: scraping the Inner Mongolia Autonomous Region Environmental Protection Department (内蒙古自治区环境保护厅)

Main program. The spider starts from the project-acceptance list page, retries any response that comes back suspiciously short, follows every detail link on each list page, and walks the pagination pages index_1.html through index_5.html:

# -*- coding: utf-8 -*-
import re, scrapy
from urllib.parse import urljoin
from nmgepb.items import NmgepbItem


class BasicNmgepbSpider(scrapy.Spider):
    name = 'basic_nmgepb'
    allowed_domains = ['nmgepb.gov.cn']
    start_urls = ['http://nmgepb.gov.cn/']

    def __init__(self):
        self.countNum = 1
        self.startLink = "http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index.html"

    def start_requests(self):
        yield scrapy.Request(url=self.startLink, dont_filter=True, callback=self.link_parse)

    def customXpathParse(self, value):
        # Join the extracted text fragments and trim surrounding whitespace.
        return ''.join(value).strip()

    def customReParse(self, condition, index=1):
        # Return the captured regex group, or '' when nothing matched.
        if condition:
            return condition.group(index).strip()
        else:
            return ""

    def link_parse(self, response):
        if len(response.text) < 1000:
            # Suspiciously short response: request the same URL again.
            yield scrapy.Request(url=response.url, dont_filter=True, callback=self.link_parse)
        else:
            allLinks = response.xpath("/html/body/div[3]/div/div[3]/div[2]/ul/li/span[2]/a/@href").extract()
            for link in allLinks:
                link = urljoin(response.url, link)
                yield scrapy.Request(url=link, callback=self.info_parse)
            if response.url == self.startLink:
                # Follow the pagination pages index_1.html .. index_5.html.
                for pageNum in range(1, 6):
                    link = '{0}_{1}.html'.format(self.startLink.split('.html')[0], pageNum)
                    yield scrapy.Request(url=link, callback=self.link_parse)

    def info_parse(self, response):
        # Skip these three detail pages.
        if response.url in (
                'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472451.html',
                'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472450.html',
                'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472443.html'):
            return
        item = NmgepbItem()
        trData = response.xpath('//table//tr')
        tableClass = self.customXpathParse(response.xpath('//table/@class').extract())
        if trData:
            for data in trData:
                tdNum = len(data.xpath('./td'))
                firstTd = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                lastTd = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                if tdNum == 3:
                    if tableClass == 'MsoTableGrid':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = ''
                        if item['title'] and item['title'] not in ('编号', '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                    elif tableClass == 'FCK__ShowTableBorders':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['place'] = ''
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        if item['title'] and item['title'] not in ('编号', '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                elif tdNum == 6 and lastTd:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                    item['date'] = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                    if item['title'] and item['title'] not in ('编号', '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
                elif (tdNum == 5 or tdNum == 6) and not lastTd:
                    if firstTd.isdigit():
                        # First cell is a row number; fields start at td[2].
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                        if item['title'] and item['title'] not in ('编号', '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                    else:
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['mechanism'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                        if item['title'] and item['title'] not in ('编号', '项目名称'):
                            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                            self.countNum += 1
                            yield item
                elif tdNum == 7:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[7]//text()').extract())
                    if item['title'] and item['title'] not in ('编号', '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
                elif tdNum == 9:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[9]//text()').extract())
                    if item['title'] and item['title'] not in ('编号', '项目名称'):
                        print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                        self.countNum += 1
                        yield item
        else:
            # No table on the page: fall back to regex extraction over the raw HTML.
            item['link'] = response.url
            item['title'] = self.customReParse(re.search(r'<strong>项目名称:</strong>(.*?)<', response.text, re.I))
            item['place'] = self.customReParse(re.search(r'<strong>建设地点:</strong>(.*?)<', response.text, re.I))
            item['company'] = self.customReParse(re.search(r'<strong>建设单位:</strong>(.*?)<', response.text, re.I))
            item['mechanism'] = self.customReParse(re.search(r'<strong>环境影响评价机构:</strong>(.*?)<', response.text, re.I))
            item['date'] = self.customReParse(re.search(r'<strong>受理日期:</strong>(.*?)<', response.text, re.I))
            if item['title'] and item['title'] not in ('编号', '项目名称'):
                print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(self.countNum, item['link'], item['title']))
                self.countNum += 1
                yield item
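The info_parse callback covers two page layouts: most detail pages carry their records in a table, and the cell count per row (tdNum) plus the table's class attribute decide which td maps to which field; pages with no table at all fall back to regular expressions over the raw HTML. A minimal sketch of that fallback path, run against a hypothetical detail-page fragment (the pattern is the one the spider uses):

import re

def customReParse(condition, index=1):
    # Same helper as in the spider: captured group, or '' when nothing matched.
    return condition.group(index).strip() if condition else ""

html = '<p><strong>项目名称:</strong>某某风电场建设项目<br/></p>'  # hypothetical fragment
title = customReParse(re.search(r'<strong>项目名称:</strong>(.*?)<', html, re.I))
print(title)  # -> 某某风电场建设项目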

items:

import scrapy


class NmgepbItem(scrapy.Item):
    link = scrapy.Field()
    title = scrapy.Field()
    place = scrapy.Field()
    company = scrapy.Field()
    mechanism = scrapy.Field()
    date = scrapy.Field()
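NmgepbItem behaves like a dict whose keys are fixed by the Field declarations. A quick sketch, runnable from inside the project (the values are hypothetical):

from nmgepb.items import NmgepbItem

item = NmgepbItem()
item['title'] = '某某建设项目'   # declared field: OK
print(item.get('date'))          # None until the spider assigns it
# item['author'] = '...'         # would raise KeyError: not a declared Field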

middlewares:

from scrapy import signals


class NmgepbSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class NmgepbDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
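Both classes above are the untouched templates that scrapy startproject generates; they do nothing for this crawl and stay inactive unless enabled in settings.py, e.g. (not needed here):

SPIDER_MIDDLEWARES = {
    'nmgepb.middlewares.NmgepbSpiderMiddleware': 543,
}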

pipelines:

import os, csv


class NmgepbPipeline(object):
    def __init__(self):
        self.csvFilePath = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'nmgepb.csv')
        self.csvFile = open(self.csvFilePath, 'w', encoding='gb18030', newline='')
        self.csvWrite = csv.writer(self.csvFile)
        self.csvWrite.writerow(['页面链接', '项目名称', '建设地点', '建设单位', '评价机构', '受理日期'])

    def process_item(self, item, spider):
        self.csvWrite.writerow([item.get('link'), item.get('title'), item.get('place'),
                                item.get('company'), item.get('mechanism'), item.get('date')])
        return item

    def close_spider(self, spider):
        self.csvFile.close()
        print("恭喜, 数据采集完成, 存储路径:%s" % self.csvFilePath)
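The pipeline writes nmgepb.csv into the project root (two dirname calls up from pipelines.py). gb18030 is presumably chosen so the Chinese headers open cleanly in Excel on a Chinese-locale Windows machine; encoding='utf-8-sig' is a common alternative that recent Excel versions also read correctly.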

settings (add the following):

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
}

ITEM_PIPELINES = {
    'nmgepb.pipelines.NmgepbPipeline': 300,
}
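With the pipeline registered, the crawl is launched from the project root (assuming the project was created as nmgepb):

scrapy crawl basic_nmgepb

The spider prints each accepted record as it is yielded, and on close the pipeline reports the path of the finished nmgepb.csv next to scrapy.cfg. (The USER_AGENT setting is the more common way to pin the user agent; the DEFAULT_REQUEST_HEADERS entry above has the same effect here.)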

 

Reposted from: https://www.cnblogs.com/mayunji/p/8874311.html
