python爬虫的练习

目录
1. 爬取天气网的北京城市历史天气数据
   1.1 第一种：使用面向对象（OOP）编写爬虫
   1.2 第二种：使用面向过程函数编写爬虫
1.爬取天气网的北京城市历史天气数据
1.1 第一种使用面向对象OOP编写爬虫
import re
import requests
from bs4 import BeautifulSoup
import xlwtclass Spider ( object ) : """ 天气数据爬虫类 """ datatime_pattern = re. compile ( r'<div class="th200">(.*?)</div>' ) wendu_pattern = re. compile ( r'<div class="th140">(.*?)</div>' ) def __init__ ( self, url, headers, filepath) : """ 初始化方法 :param url: 基础URL模板 :param headers: HTTP请求头 :param filepath: 输出文件路径 """ self. url = urlself. headers = headersself. datalist = [ ] self. mwen = [ ] self. iwen = [ ] self. tq = [ ] self. fx = [ ] self. filepath = filepath def download_page ( self, url) : """ 下载页面并返回页面内容 :param url: 要下载的页面URL :return: 页面内容或None(如果下载失败)""" try : response = requests. get( url, headers= self. headers) response. raise_for_status( ) return response. textexcept requests. RequestException as e: print ( f"Error downloading page: { e} " ) return None def parse_page ( self, html) : """ 解析页面内容,提取日期和温度数据 :param html: 页面内容 """ soup = BeautifulSoup( html, 'html.parser' ) for item in soup. find_all( 'ul' , class_= 'thrui' ) : item_str = str ( item) dates = re. findall( self. datatime_pattern, item_str) self. datalist. extend( dates) temperatures = re. findall( self. wendu_pattern, item_str) print ( temperatures) for i in range ( 0 , len ( temperatures) , 4 ) : self. mwen. append( temperatures[ i] ) self. iwen. append( temperatures[ i + 1 ] ) self. tq. append( temperatures[ i + 2 ] ) self. fx. append( temperatures[ i + 3 ] ) def download_and_parse_all_pages ( self) : """ 下载并解析所有页面 """ for year in range ( 23 , 24 ) : for month in range ( 1 , 2 ) : page_url = f" { self. url} 20 { year: 02d } { month: 02d } .html" print ( page_url) html = self. download_page( page_url) if html: self. parse_page( html) def save_to_excel ( self) : """ 将爬取的数据保存到Excel文件中 """ workbook = xlwt. Workbook( encoding= 'utf-8' , style_compression= 0 ) worksheet = workbook. add_sheet( '北京历史天气数据' , cell_overwrite_ok= True ) columns = ( "日期" , "最高温度" , "最低温度" , "天气" , "风向" ) for i, col in enumerate ( columns) : worksheet. write( 0 , i, col) for i in range ( len ( self. datalist) ) : worksheet. 
write( i + 1 , 0 , self. datalist[ i] ) worksheet. write( i + 1 , 1 , self. mwen[ i] ) worksheet. write( i + 1 , 2 , self. iwen[ i] ) worksheet. write( i + 1 , 3 , self. tq[ i] ) worksheet. write( i + 1 , 4 , self. fx[ i] ) workbook. save( self. filepath) print ( f"Data saved to { self. filepath} " ) def run ( self) : self. download_and_parse_all_pages( ) self. save_to_excel( ) if __name__ == '__main__' : headers = { 'User-Agent' : 'Mozilla/5.0(compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)' } url_template = "http://lishi.tianqi.com/beijing/" filepath = "beijing_weather_data.xls" spider = Spider( url_template, headers, filepath) spider. run( )
1.2 第二种使用面向过程函数编写爬虫
import requests
from bs4 import BeautifulSoup
import re
import xlwtdatatime = re. compile ( '<div class="th200">(.*?)</div>' )
wendu = re. compile ( '<div class="th140">(.*?)</div>' ) def down_allpage ( url) : datalist = [ ] mwen = [ ] iwen = [ ] tq = [ ] fx = [ ] for i in range ( 23 , 24 ) : for j in range ( 1 , 2 ) : baseurl = url + '20{}{:0>2d}.html' . format ( i, j) html = down_page( baseurl) soup = BeautifulSoup( html, 'html.parser' ) for item in soup. find_all( 'ul' , class_= 'thrui' ) : item = str ( item) riqi = re. findall( datatime, item) for item1 in riqi: datalist. append( item1) zb_all = re. findall( wendu, item) for i in range ( 31 ) : mwen. append( zb_all[ i* 4 + 0 ] ) iwen. append( zb_all[ i* 4 + 1 ] ) tq. append( zb_all[ i* 4 + 2 ] ) fx. append( zb_all[ i* 4 + 3 ] ) return datalist, mwen, iwen, tq, fxdef save_xls ( datalist, mwen, iwen, tq, fx) : wb = xlwt. Workbook( encoding= 'utf-8' , style_compression= 0 ) ws = wb. add_sheet( '天气数据' , cell_overwrite_ok= True ) col = ( "日期" , "最高温度" , "最低温度" , "天气" , "风向" ) for i in range ( len ( col) ) : ws. write( 0 , i, col[ i] ) for i in range ( len ( datalist) ) : ws. write( i+ 1 , 0 , datalist[ i] ) for i in range ( len ( mwen) ) : ws. write( i+ 1 , 1 , mwen[ i] ) for i in range ( len ( iwen) ) : ws. write( i+ 1 , 2 , iwen[ i] ) for i in range ( len ( tq) ) : ws. write( i+ 1 , 3 , tq[ i] ) for i in range ( len ( fx) ) : ws. write( i+ 1 , 4 , fx[ i] ) wb. save( r'D:\天气数据.xls' ) def down_page ( url) : headers = { 'User-Agent' : 'Mozilla/5.0(compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)' } r = requests. get( url, headers= headers) html = r. textreturn htmlif __name__ == '__main__' : url = 'http://lishi.tianqi.com/beijing/' down_allpage( url) datalist, mwen, iwen, tq, fx = down_allpage( url) print ( datalist) save_xls( datalist, mwen, iwen, tq, fx)