用XPath来做一个简单的爬虫：我们尝试爬取某个贴吧里的所有帖子，并且将每个帖子里各楼层发布的图片下载到本地。
import requests
from lxml import etree
import json


class Tieba:
    """Crawl the thread listing of one Baidu Tieba forum and record image URLs.

    For every thread found on the listing pages, the image URLs posted in
    that thread are collected and the result is appended, as JSON, to a
    local text file.
    """

    # Number of listing pages to walk, and the ``pn`` offset between pages.
    PAGE_COUNT = 100
    PAGE_STEP = 50
    # File that ``save_item`` appends to (name kept exactly as the original).
    OUTPUT_PATH = "teibatupian.txt"

    def __init__(self, tieba_name):
        """Store the forum name and the request headers.

        Args:
            tieba_name: Name of the forum, used as the ``kw`` query value.
        """
        self.tieba_name = tieba_name
        # A mobile Safari User-Agent is sent with every request.
        # NOTE(review): presumably this selects the mobile page layout that
        # the XPath expressions below target - confirm against the live site.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) "
                "AppleWebKit/601.1.46 (KHTML, like Gecko) "
                "Version/9.0 Mobile/13B143 Safari/601.1"
            )
        }

    def get_total_url_list(self):
        """Return the URLs of all listing pages to crawl.

        Returns:
            A list of ``PAGE_COUNT`` URLs whose ``pn`` offsets are
            0, ``PAGE_STEP``, 2 * ``PAGE_STEP``, ...
        """
        # Built by concatenation rather than ``str.format`` so that a forum
        # name containing "{" or "}" cannot be mistaken for a format field.
        prefix = (
            "https://tieba.baidu.com/f?kw=" + self.tieba_name + "&ie=utf-8&pn="
        )
        return [
            prefix + str(page * self.PAGE_STEP) + "&"
            for page in range(self.PAGE_COUNT)
        ]

    def parse_url(self, url):
        """Fetch ``url`` and return the response parsed by ``etree.HTML``.

        Args:
            url: Address to request.

        Returns:
            The object returned by ``etree.HTML`` for the decoded body.

        Raises:
            requests.RequestException: If the request fails or times out.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        print("parsing url:", url)
        response = requests.get(url, headers=self.headers, timeout=10)
        return etree.HTML(response.content.decode())

    def get_title_href(self, url):
        """Extract the link and title of every thread on one listing page.

        Args:
            url: Address of a listing page.

        Returns:
            A list of ``{"href": ..., "text": ...}`` dicts; either value is
            ``None`` when the corresponding node is missing.
        """
        html = self.parse_url(url)
        total_items = []
        for entry in html.xpath("//li[@class='tl_shadow']"):
            # Evaluate each XPath once and reuse the result.
            hrefs = entry.xpath("./a/@href")
            texts = entry.xpath("./a/div[1]/span[1]/text()")
            # NOTE(review): prefixing "https:" assumes the page emits
            # protocol-relative links ("//host/path") - TODO confirm.
            href = ("https:" + hrefs[0]) if hrefs else None
            text = texts[0] if texts else None
            total_items.append(dict(href=href, text=text))
        return total_items

    def get_img(self, url):
        """Collect the image URLs found in one thread page.

        Args:
            url: Address of a thread.

        Returns:
            A list of image URLs: the part of each ``data-url`` attribute
            after the last ``src=``, percent-decoded.
        """
        html = self.parse_url(url)
        data_urls = html.xpath('//div[@data-class="BDE_Image"]/@data-url')
        return [
            requests.utils.unquote(data_url.split("src=")[-1])
            for data_url in data_urls
        ]

    def save_item(self, item):
        """Append one item to ``OUTPUT_PATH`` as indented JSON plus a newline.

        Args:
            item: JSON-serialisable dict describing one thread.
        """
        # ``ensure_ascii=False`` emits raw non-ASCII characters, so the file
        # encoding is pinned to UTF-8 instead of the platform default.
        with open(self.OUTPUT_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False, indent=2))
            f.write("\n")

    def run(self):
        """Crawl every listing page, attach image URLs, then print and save."""
        for url in self.get_total_url_list():
            for item in self.get_title_href(url):
                href = item["href"]
                # An entry without a link has no thread page to fetch;
                # requesting ``None`` would raise and abort the whole crawl.
                item["img"] = self.get_img(href) if href else []
                print(item)
                self.save_item(item)


if __name__ == "__main__":
    tieba = Tieba("猫")
    tieba.run()