Python3 实现游戏主播人气排行榜
from urllib import request
import re

# Scrape a popularity ranking of game streamers (popularity = the number of
# viewers each streamer currently has).
#
# Scraping preliminaries:
#   1. Be clear about what the scraper is for.
#   2. Find the web page that holds the data.
#   3. Analyse the page structure to locate the tags containing the data.
#   4. Simulate an HTTP request, send it to the server and receive the HTML
#      the server returns.
#   5. Use regular expressions to extract the data we want
#      (streamer name, popularity).
#
# Shorthand character classes:
#   \d / \D   digit / non-digit
#   \w / \W   word character / non-word character
#   \s / \S   whitespace / non-whitespace
#   .         any character except the newline \n
#
# Scraping frameworks: Scrapy, BeautifulSoup.
# Going further: crawlers, big data, data storage, data analysis.
# Common issues: anti-scraping measures, countering anti-scraping measures,
# banned IPs, proxy IP pools.


class Spider():
    """Fetch a category page and print its streamers ranked by popularity.

    The page is parsed with the regular expressions stored as class
    attributes; call ``go()`` to run the whole fetch/parse/sort/print
    pipeline.
    """

    # Page whose streamer cards are scraped.
    url = 'https://www.panda.tv/cate/lol'
    # Captures the inner HTML of one streamer card.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Captures the streamer name inside a card (text after the icon tag).
    name_pattern = r'</i>([\s\S]*?)</span>'
    # Captures the viewer-count text inside a card.
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # bytes
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches for every streamer card.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts, one per
        card found in ``htmls``; each value is the list of regex matches and
        may be empty. An HTML string without cards yields an empty list.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, card),
                'number': re.findall(Spider.number_pattern, card),
            }
            for card in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Lazily reduce each anchor to its first, stripped name and number.

        Anchors whose name or number list is empty are skipped instead of
        raising ``IndexError``.
        """
        return (
            {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0].strip(),
            }
            for anchor in anchors
            if anchor['name'] and anchor['number']
        )

    def __sort(self, anchors):
        """Return the anchors as a new list, most popular first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the numeric popularity of ``anchor`` as a float.

        The first number in ``anchor['number']`` is used, with or without a
        decimal point. If the text contains '万' (ten thousand) the value is
        multiplied by 10000. Text without any number counts as 0.0.
        """
        text = anchor['number']
        # Accept "3456", "1.5", "12." and ".5"; the previous pattern required
        # a decimal point and crashed on plain integer counts.
        match = re.search(r'\d+\.?\d*|\.\d+', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N:name number`` line per anchor, in order."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank ' + str(rank)
                  + ':' + anchor['name']
                  + ' ' + anchor['number'])

    def go(self):
        """Run the full pipeline: fetch, parse, clean, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Only scrape when executed as a script, so importing this module does not
# trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()