广西南宁政府门户网站
import io
import os
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor

import docx
import numpy as np
import pandas as pd
import pdfplumber
import requests
import win32com.client as win32
import xlrd
from bs4 import BeautifulSoup
from docx import Document

# Browser-like request headers; one is picked at random for each page request.
headers = [
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76'},
]

# Placeholder text stored for documents whose content is not extracted.
temp = ''
with open('D:/t.txt', 'r', encoding='utf-8') as f:
    temp = f.read()

# Accumulated result rows; the column order is defined by _make_row().
contents = []

# Saved files that are downloaded but deliberately not parsed.
_SKIPPED_PDFS = frozenset({
    'D:/308 南宁市卫生和计划生育委员会2018年部门预算及“三公”经费预算/308南宁市卫生和计划生育委员会及所属单位2018年部门预算及“三公”经费预算公开.pdf',
})
_SKIPPED_WORD_FILES = frozenset({
    'D:/市工信委2017年部门预算及“三公”经费公开.docx',
    'D:/2017年部门预算公开-政府办公厅.docx',
    'D:/政务办2017年部门预算及“三公”经费公开 (1).doc',
    'D:/南宁市茅桥地区人民检察院2018年部门预算及“三公”经费预算.doc',
    'D:/017 南宁市审计局2018年部门预算及“三公”经费预算.doc',
    'D:/南宁市人大常委会办公厅2018年部门预算及“三公”经费预算 (1).doc',
    'D:/2017年市民宗委预算公开 (1).doc',
    'D:/预算 名词解释.docx',
    'D:/南宁市编办2017年部门预算公开名词解释.docx',
    'D:/南宁市编办2016年决算收支增减变化情况说明.docx',
})
_SKIPPED_EXCEL_FILES = frozenset({
    'D:/附件—市发展改革委2018年预算公开附件.xlsx',
    'D:/2017预算公开附件-市人大.xls',
    'D:/南宁市科协2020年部门预算公开附件.xlsx',
    'D:/宣传部2017年预算公开附件0309.xlsx',
})
_SKIPPED_ARCHIVES = frozenset({
    'D:/17年预算信息公开.zip',
    'D:/卫计委2017年部门预算公开.zip',
})


def _download(url, path):
    """Download *url*, save the body to *path* and print the saved path."""
    response = requests.get(url)
    with open(path, 'wb') as f:
        f.write(response.content)
    print(path)


def _extract_pdf_text(path):
    """Return the text of every page of the PDF at *path*, space-joined.

    Returns '' for paths listed in _SKIPPED_PDFS.
    """
    if path in _SKIPPED_PDFS:
        return ''
    with pdfplumber.open(path) as pdf:
        # extract_text() may yield None for a page without text; treat as ''.
        return ' '.join(page.extract_text() or '' for page in pdf.pages)


def _extract_word_text(path):
    """Return the text of the Word document at *path* via Word automation.

    Returns '' for paths listed in _SKIPPED_WORD_FILES.
    """
    if path in _SKIPPED_WORD_FILES:
        return ''
    app = win32.DispatchEx("Word.Application")
    try:
        doc = app.Documents.Open(path)
        return doc.Content.Text
    finally:
        # Always shut Word down, even if opening or reading the file fails.
        app.Quit()


def _extract_excel_text(path):
    """Return the first sheet of the workbook at *path* rendered as text.

    Returns '' when the file cannot be opened, when the path is listed in
    _SKIPPED_EXCEL_FILES, or when the extension is not exactly 'xls'/'xlsx'.
    """
    try:
        # Probe that the saved file can be opened at all.
        with open(path, 'rb'):
            pass
    except OSError:
        return ''
    if path in _SKIPPED_EXCEL_FILES:
        return ''
    extension = path.split('.')[-1]
    # NOTE(review): the comparison is case-sensitive, so '.XLS' files yield ''.
    if extension == 'xls':
        df = pd.read_excel(path, engine='xlrd')
    elif extension == 'xlsx':
        df = pd.read_excel(path, engine='openpyxl')
    else:
        return ''
    return df.to_string(index=False)


def _parse_saved_file(file_type, title):
    """Extract text from the already-saved file D:/<title> by *file_type*."""
    path = f'D:/{title}'
    if file_type == 'pdf':
        return _extract_pdf_text(path)
    if file_type in ('doc', 'docx'):
        return _extract_word_text(path)
    if file_type in ('xls', 'xlsx', 'XLS'):
        return _extract_excel_text(path)
    return ''


def _make_row(year, date, title, text, url):
    """Build one result row in the column order of the output spreadsheet."""
    return [year, date, '广西', '南宁市', title, text, '预算公开', url]


def _fetch_page(url):
    """Fetch *url* after a 10 s pause, retrying once on a non-200 status.

    Returns the response with its encoding set to UTF-8, or None when both
    attempts return a non-200 status.
    """
    time.sleep(10)
    r = requests.get(url, headers=np.random.choice(headers))
    if r.status_code != 200:
        print('connect error!')
        r = requests.get(url, headers=np.random.choice(headers))
        if r.status_code != 200:
            return None
    r.encoding = 'utf-8'
    return r


def get_pdf_content(url, title):
    """Download a PDF to D:/<title> and return its extracted text."""
    path = f'D:/{title}'
    _download(url, path)
    return _extract_pdf_text(path)


def doc_to_docx(title):
    """Re-save the Word file D:\\<title> as D:\\<title>x using Word."""
    word = win32.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open('D:\\' + title)
        doc.SaveAs('D:\\' + title + 'x')
        doc.Close()
    finally:
        word.Quit()


def get_doc_content(url, title):
    """Download a Word document to D:/<title> and return its text."""
    path = f'D:/{title}'
    _download(url, path)
    return _extract_word_text(path)


def get_xls_content(url, title):
    """Download a workbook to D:/<title> and return it rendered as text."""
    path = f'D:/{title}'
    _download(url, path)
    return _extract_excel_text(path)


def solve_file(file_type, file_url, title):
    """Download *file_url* as D:/<title> and return its text by *file_type*.

    Returns '' for file types that are not pdf, doc(x) or xls(x).
    """
    if file_type == 'pdf':
        return get_pdf_content(file_url, title)
    if file_type in ('doc', 'docx'):
        return get_doc_content(file_url, title)
    if file_type in ('xls', 'xlsx', 'XLS'):
        return get_xls_content(file_url, title)
    return ''


def unzip_file(save_file, unzip_dir_path):
    """Extract every member of *save_file* into *unzip_dir_path*.

    Returns the list of member names as extracted.
    """
    file_name = []
    with zipfile.ZipFile(save_file, 'r') as zip_file:
        for zip_info in zip_file.infolist():
            # zipfile decodes names as cp437 unless the UTF-8 flag (0x800) is
            # set; only in the cp437 case re-interpret the raw bytes as GBK.
            if not zip_info.flag_bits & 0x800:
                try:
                    zip_info.filename = (
                        zip_info.filename.encode('cp437').decode('gbk')
                    )
                except UnicodeError:
                    # Name is not GBK after all; keep what zipfile decoded.
                    pass
            zip_file.extract(zip_info, unzip_dir_path)
            file_name.append(zip_info.filename)
    return file_name


def get_file(url, base, year, date):
    """Process every attachment linked from one article page.

    Args:
        url: URL of the article page.
        base: prefix prepended to each attachment's href.
        year: value stored in the year column of each row.
        date: value stored in the publish-date column of each row.

    Appends one row to ``contents`` per attachment with non-empty text, and
    one row per member of each zip attachment.
    """
    r = _fetch_page(url)
    if r is None:
        return
    soup = BeautifulSoup(r.text, 'html.parser')
    attachments = soup.find('div', class_='downfile')
    if attachments is None:
        # Page has no attachment section; nothing to collect.
        return
    for link in attachments.find_all('a'):
        file_url = base + link['href']
        file_type = file_url.split('.')[-1]
        print(file_type)
        title = link.text
        if '.' not in title:
            title += '.' + file_type
        text = solve_file(file_type, file_url, title)
        if text != '':
            row = _make_row(year, date, title, text, file_url)
            contents.append(row)
            print(row)
        elif file_type == 'zip':
            archive_path = f'D:/{title}'
            _download(file_url, archive_path)
            if archive_path in _SKIPPED_ARCHIVES:
                continue
            for name in unzip_file(archive_path, 'D:/'):
                path = 'D:/' + name
                member_type = path.split('.')[-1]
                if member_type in ('xls', 'xlsx', 'doc', 'XLS'):
                    # These member types are recorded with the placeholder.
                    text = temp
                else:
                    # Parse the extracted file in place instead of
                    # re-downloading anything over it.
                    text = _parse_saved_file(member_type, name)
                row = _make_row(
                    year, date, path.split('/')[-1], text, file_url
                )
                contents.append(row)
                print(row)
        # Any other type with no extracted text (e.g. rar) is ignored.


def get_url(urls, year):
    """Process one listing page of a year's budget publications.

    Args:
        urls: URL of the listing page.
        year: value stored in the year column of each row.

    Entry hrefs are resolved against the module-level ``base_urls``. Entries
    that link straight to a PDF are downloaded and recorded here; all others
    are handed to get_file().
    """
    r = _fetch_page(urls)
    if r is None:
        return
    soup = BeautifulSoup(r.text, 'html.parser')
    listing = soup.find('div', class_='nav1Cont')
    if listing is None:
        # Page has no listing section; nothing to collect.
        return
    for item in listing.find_all('li'):
        date = item.find('span', class_='time').text
        anchor = item.find('a')
        url = base_urls + anchor['href']
        title = anchor.text.strip()
        if url.split('.')[-1] == 'pdf':
            text = get_pdf_content(url, title + '.pdf')
            if text.strip() == '':
                # No extractable text: store the placeholder instead.
                text = temp
            row = _make_row(year, date, title, text, url)
            contents.append(row)
            print(row)
            continue
        get_file(url, base_urls, year, date)


for year in range(2018, 2023):
    base_urls = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/'
    get_url(base_urls, year)
    # Follow-up listing pages; stop at the first one that does not exist.
    for i in range(1, 7):
        url = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/index_{i}.html'
        r = requests.get(url)
        if r.status_code != 200:
            break
        get_url(url, year)

df = pd.DataFrame(contents, columns=['年份', '发布日期', '省份', '城市', '标题', '文本', '类型', '下载链接'])
df.to_excel('D:/广西-南宁-部门预算2017.xlsx', index=False)
爬取网页附件,根据文件类型分类处理,显示文件内容并制成表格