原文链接:http://cuiqingcai.com/993.html
划重点:
1.提取帖子内容时,对图片,贴吧自动增加的超链接,制表符,换行符要做删除或替换处理
2.decode是把bytes转换为str,encode是把str转换为bytes。原帖中的代码第100行多了一个encode,导致出错
3.代码中用到了文件相关操作
4.原文中获取标题的正则表达式我觉得不太对,做了修改。原文只提取了<h1></h1>之间的标题,但实际上标题也可能在<h3></h3>之间
最终代码如下,在python3.4.3中实现
import urllib.request import urllib.parse import re#处理页面标签类 class Tool:#去除img标签,7位长空格removeImg = re.compile('<img.*?>| {7}')#删除超链接标签removeAddr = re.compile('<a.*?>|</a>')#把换行的标签换为\nreplaceLine = re.compile('<tr>|<div>|</div>|</p>')#将表格制表<td>替换为\treplaceTD = re.compile('<td>')#把段落开头换为\n加空两格replacePara = re.compile('<p.*?>')#将换行符或双换行符替换为\nreplaceBR = re.compile('<br><br>|<br>')#将其余标签剔除removeExtraTag = re.compile('<.*?>')def replace(self, x):x = re.sub(self.removeImg, "", x)x = re.sub(self.removeAddr, "", x)x = re.sub(self.replaceLine, "\n", x)x = re.sub(self.replaceTD, "\t", x)x = re.sub(self.replacePara, "\n ", x)x = re.sub(self.replaceBR, "\n", x)x = re.sub(self.removeExtraTag, "", x)return x.strip()#百度贴吧爬虫类 class BDTB:#初始化,传入基地址,是否只看楼主的参数def __init__(self, baseUrl, seeLZ,floorTag):self.baseURL = baseUrlself.seeLZ = '?see_lz='+str(seeLZ)self.tool = Tool()self.file = Noneself.floor = 1self.defaultTitle = u"百度贴吧"self.floorTag = floorTag#传入页码,获取该页帖子的代码def getPage(self, pageNum):try:url = self.baseURL+self.seeLZ+'&pn='+str(pageNum)request = urllib.request.Request(url)response = urllib.request.urlopen(request)return response.read().decode('utf-8','ignore') #注意转换成字符串except urllib.error.URLError as e:if hasattr(e, "reason"):print(u"连接百度贴吧失败,错误原因:", e.reason)return None#获取帖子标题def getTitle(self):pageCode = self.getPage(1)pattern = re.compile('''<h\d class="core_title_txt.*?title="(.*?)" style="width:.*?</h\d>''', re.S)result = re.search(pattern, pageCode)if result:title = result.group(1).strip() #这里注意,获取分组的方法return titleelse:return None#提取帖子页数def getPageNum(self):pageCode = self.getPage(1)pattern = re.compile('''<span class=.*?</span>.*?回复贴,共.*?<span class=.*?>(.*?)</span>''', re.S)result = re.search(pattern, pageCode)if result:pageNum = result.group(1).strip()return pageNumelse:return None#获取每一层楼的内容,传入页面内容def getContent(self, page):pattern = re.compile('<div id="post_content_.*?>(.*?)</div>',re.S)items = re.findall(pattern, page)contents = []for item in items:content = 
"\n"+self.tool.replace(item)+"\n"contents.append(content)return contentsdef setFileTitle(self, title):if title is not None:self.file = open(title+".txt","w+")else:self.file = open(self.defaultTitle+".txt","w+")def writeData(self, contents):#向文件写入每一楼的信息for item in contents:if self.floorTag == '1':#楼之间的分隔符floorLine = "\n" + str(self.floor) + "楼-------------------------------------"self.file.write(floorLine)self.file.write(item)self.floor+=1def start(self):pageNum = self.getPageNum()title = self.getTitle()self.setFileTitle(title)if pageNum == None:print(u"URL已失效,请重试")return try:print("该帖子共有" + str(pageNum) + "页")for i in range(1, int(pageNum) + 1):print("正在写入第"+str(i)+"页数据")page = self.getPage(i)contents = self.getContent(page)self.writeData(contents)#出现写入异常except IOError as e:print("写入异常,原因"+e.message)finally:print("写入任务完成")print(u"请输入帖子代号") baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/')) seeLZ = input("是否只看楼主发言,是输入1,否输入0\n") floorTag = input("是否写入楼层信息,是输入1,否输入0\n") bdtb = BDTB(baseURL, seeLZ, floorTag) bdtb.start()