声明:案例只用于学习,不得恶意使用
要求:获取帖子的标题和链接
import requests
from lxml import etree


class Tieba(object):
    """Fetch a Baidu Tieba forum list page and count thread link elements."""

    def __init__(self, name):
        # Forum list URL built from the forum name (keyword).
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }

    def get_data(self, url):
        """GET *url* with the configured headers; return raw bytes.

        Bug fix: the original ignored the ``url`` parameter and always
        requested ``self.url``. Callers already pass ``self.url`` so the
        observable behavior is unchanged.
        """
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_data(self, data):
        """Parse the page and print how many thread <a> elements matched."""
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        # NOTE: with a modern User-Agent this prints 0, because the page
        # content is wrapped in HTML comments (see the notes below).
        print(len(el_list))

    def run(self):
        """Entry point: fetch the first page and parse it."""
        data = self.get_data(self.url)
        self.parse_data(data)


if __name__ == '__main__':
    tieba = Tieba('龙之信条2')
    tieba.run()
运行后发现结果是0,因为内容被注释了
解决方法1:
更改User-Agent,将其改为低版本的浏览器
'User-Agent':'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)'
解决方法2:
用字符串的replace方法去掉注释标记(并非正则表达式)
data = data.decode().replace("<!--", "").replace("-->", "")
完整代码:
import requests
from lxml import etree


class Tieba(object):
    """Scrape thread titles and links from a Baidu Tieba forum, page by page."""

    def __init__(self, name):
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            # With a modern UA the page content arrives wrapped in HTML
            # comments; a legacy-browser UA receives it un-commented.
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
            'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)'
        }

    def get_data(self, url):
        """GET *url* with the configured headers; return raw bytes.

        Bug fix: the original ignored the ``url`` parameter and always
        requested ``self.url``. ``run()`` passes ``self.url``, so behavior
        for existing callers is unchanged.
        """
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_data(self, data):
        """Extract (title, link) records and the next-page URL from one page.

        Returns:
            tuple: (list of {'title', 'link'} dicts, next page URL or None).
        """
        # Alternative to the legacy-UA trick: strip the comment markers so
        # the real content becomes visible to the parser.
        # data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        print(len(el_list))

        data_list = []
        for el in el_list:
            temp = {}
            temp['title'] = el.xpath('./text()')[0]
            temp['link'] = 'http://tieba.baidu.com' + el.xpath('./@href')[0]
            data_list.append(temp)

        # Bug fix: only IndexError can occur here (xpath returned no match);
        # the original bare ``except:`` would also swallow KeyboardInterrupt
        # and SystemExit.
        try:
            next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Output each scraped record (replace with CSV/XLSX export as needed)."""
        for data in data_list:
            print(data)

    def run(self):
        """Crawl successive pages until no next-page link is found."""
        while True:
            data = self.get_data(self.url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            if next_url is None:  # idiomatic None check (was ``== None``)
                break
            self.url = next_url


if __name__ == '__main__':
    tieba = Tieba('龙之信条2')
    tieba.run()
可完善的地方:save_data()方法,将数据保存为csv或xlsx文件
快来试试吧
可参考:【Python爬虫】基本操作中"数据存储——CSV文件"