Python-爬虫案例
- 代码
- 代码
代码
import requests
import json
import threading
from queue import Queue
import time


class HeiMaTouTiao:
    """Multi-threaded crawler for article comment statistics from the
    itheima toutiao management API.

    One producer thread enqueues page URLs, nine worker threads fetch and
    parse them, and one writer thread appends the parsed records to
    ``toutiao.json``.
    """

    def __init__(self):
        # Request headers: browser UA plus the bearer token the API requires.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/92.0.4515.107 Safari/537.36",
            'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIU'
                             'zI1NiJ9.eyJleHAiOjE2NTY2NTk3NjcsInVzZXJfaWQiOjEsInJlZn'
                             'Jlc2giOmZhbHNlLCJ2ZXJpZmllZCI6dHJ1ZX0.ZSdV5mT6w_yhEKLg'
                             'qcvWNln2GKHBxfxK7d8YXaoCMYg'
        }
        # Queue of page URLs waiting to be fetched.
        self.url_queue = Queue()
        # Queue of parsed record batches waiting to be written to disk.
        self.content_queue = Queue()

    def get_url_list(self, start_page, end_page):
        """Build the listing-page URLs for [start_page, end_page] and
        enqueue them on ``self.url_queue``.

        :param start_page: first page number (1-based)
        :param end_page: last page number, inclusive
        """
        url_temp = ('http://api-toutiao-web.itheima.net/mp/v1_0/articles?'
                    'page={}&per_page=10&response_type=comment')
        for page in range(start_page, end_page + 1):
            url = url_temp.format(page)
            print('正在请求:', url)
            self.url_queue.put(url)

    def get_data(self):
        """Worker loop: fetch one URL, parse its articles, enqueue the batch.

        BUG FIX: the original kept one list shared across all URLs and
        re-enqueued the whole (growing) list after every page, so records
        from earlier pages were written to the output file multiple times.
        A fresh list is now built per URL.
        """
        while True:
            url = self.url_queue.get()
            comment = requests.get(url=url, headers=self.headers).text
            results = json.loads(comment)['data']['results']
            page_records = []
            for item in results:
                content = dict()
                content['标题'] = item['title']
                if item['comment_status'] is True:
                    content['评论状态'] = '正常'
                else:
                    content['评论状态'] = '关闭'
                content['总评论数'] = item['total_comment_count']
                content['粉丝评论数'] = item['fans_comment_count']
                page_records.append(content)
            self.content_queue.put(page_records)
            self.url_queue.task_done()

    def save_data(self):
        """Writer loop: append each batch of records to ``toutiao.json``."""
        while True:
            content_list = self.content_queue.get()
            with open('toutiao.json', mode='a+', encoding='utf-8') as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=2))
            self.content_queue.task_done()

    def run(self):
        """Read the page range from stdin and run the threaded crawl."""
        start_page = int(input('请输入抓取的起始页:'))
        end_page = int(input('请输入抓取的结束页:'))
        # Thread list.
        t_list = []
        if start_page <= 0:
            print('抓取的起始页从1开始。')
        else:
            t_url = threading.Thread(target=self.get_url_list,
                                     args=(start_page, end_page))
            t_list.append(t_url)
        # Fetch/parse worker threads.
        for _ in range(9):
            t_list.append(threading.Thread(target=self.get_data))
        # Single writer thread.
        t_list.append(threading.Thread(target=self.save_data))
        for t in t_list:
            # Thread.setDaemon() is deprecated (3.10+); set the attribute.
            t.daemon = True
            t.start()
        # Wait until every enqueued item has been processed.
        for q in [self.url_queue, self.content_queue]:
            q.join()


if __name__ == '__main__':
    heimatoutiao = HeiMaTouTiao()
    start_time = time.time()
    heimatoutiao.run()
    end_time = time.time()
    print(f'总用时:{end_time - start_time}秒')
这段Python代码定义了一个名为 HeiMaTouTiao 的类,用于爬取和保存某学习网站上的文章信息。
代码
import requests
import json
from pymongo import MongoClient
class LittleRabbit:
    """Crawler for the car-accessories category of the xiaotuxian demo
    shop; parsed goods are stored in the local MongoDB ``rabbit`` database.
    """

    def __init__(self):
        # Entry URL of the car-accessories category listing API.
        self.init_url = ('https://apipc-xiaotuxian-front.itheima.net'
                         '/category/goods/temporary')
        # Request headers.
        self.headers = {
            "Content-Type": "application/json;charset=utf-8",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'
                          'AppleWebKit/537.36 (KHTML, like Gecko)'
                          'Chrome/90.0.4430.212 Safari/537.36'
        }
        # MongoDB client connected to the local server on the default port.
        self.client = MongoClient('127.0.0.1', 27017)

    def load_category_page(self, page):
        """Fetch one listing page of the car-accessories category.

        :param page: page number to fetch
        :return: list of goods on that page
        """
        request_payload = {"page": page, "pageSize": 20,
                           "categoryId": "1005009"}
        # The API expects a JSON request body.
        json_data = json.dumps(request_payload)
        response = requests.post(url=self.init_url, data=json_data,
                                 headers=self.headers)
        # The goods list lives under result.items in the response.
        return json.loads(response.text)["result"]["items"]

    def load_detail_page(self, all_goods):
        """Fetch the detail payload for every listed good.

        :param all_goods: goods returned by :meth:`load_category_page`
        :return: list of per-good detail dicts
        """
        base_url = 'https://apipc-xiaotuxian-front.itheima.net/goods?'
        goods_detail_info = []
        for good_info in all_goods:
            # The detail endpoint takes the good's id as a query parameter.
            good_id = dict(id=good_info['id'])
            response = requests.get(url=base_url, params=good_id)
            goods_detail_info.append(json.loads(response.text))
        return goods_detail_info

    def parse_page(self, detail_data):
        """Extract the target fields from each good's detail payload.

        :param detail_data: detail dicts from :meth:`load_detail_page`
        :return: list of flat dicts ready for storage
        """
        all_goods_info = []
        temp_url = 'http://erabbit.itheima.net/#/product/'
        for info in detail_data:
            result = info['result']
            dict_data = dict()
            dict_data['商品名称'] = result['name']
            dict_data['商品描述'] = result['desc']
            dict_data['商品链接'] = temp_url + result['id']
            dict_data['商品价格'] = result['price']
            # Keep only the first picture of the detail page.
            dict_data['商品图片'] = result['mainPictures'][0]
            properties = result['details']['properties']
            # BUG FIX: the original comprehension reused the name `info`
            # for the inner loop, shadowing the outer loop variable.
            dict_data['商品详情'] = ''.join(
                ':'.join(prop.values()) + '\n' for prop in properties)
            all_goods_info.append(dict_data)
        return all_goods_info

    def save_data(self, goods_info):
        """Insert the parsed goods into MongoDB and echo the stored docs.

        :param goods_info: flat dicts produced by :meth:`parse_page`
        """
        client = self.client
        # Access (creating on demand) the `rabbit` database.
        db = client.rabbit
        try:
            for good in goods_info:
                # Insert each good into the little_rabbit collection.
                db.little_rabbit.insert_one(good)
                print('保存成功')
            # Read back and display the stored documents.
            result = db.little_rabbit.find()
            for doc in result:
                print(doc)
        except Exception as error:
            print(error)

    def run(self):
        """Drive the crawl: prompt for a page range and process each page."""
        begin_page = int(input('起始页码:'))
        end_page = int(input('结束页码:'))
        if begin_page <= 0:
            print('起始页码从1开始')
        else:
            for page in range(begin_page, end_page + 1):
                print(f'正在抓取第{page}页')
                all_goods = self.load_category_page(page)
                goods_detail = self.load_detail_page(all_goods)
                goods_info = self.parse_page(goods_detail)
                self.save_data(goods_info)


if __name__ == '__main__':
    lr = LittleRabbit()
    lr.run()
这段Python代码定义了一个名为 LittleRabbit 的类,用于爬取指定网站上的商品信息,并将其存储到MongoDB数据库中。