一. 内容简介
使用 Python 爬虫爬取中关村在线(ZOL)的笔记本电脑列表及其参数数据。
二. 软件环境
2.1 VS Code
2.2 Anaconda
version: conda 22.9.0
三.主要流程
3.1 代码
具体的解析说明都写在代码注释里。
# URL analysis
# Base listing URL: the numbers after "subcate16" are positional slots, and each slot stands for a different filter.
# https://detail.zol.com.cn/notebook_index/subcate16_0_list_1_0_99_2_0_1.html
# https://detail.zol.com.cn/notebook_index/subcate16_0_list_1_0_99_2_0_3.html
# Template: https://detail.zol.com.cn/notebook_index/subcate16_<brand>_list_1_<release period>_99_<sort order>_0_<page>.html
# Lenovo: 160 goes into the brand slot
# https://detail.zol.com.cn/notebook_index/subcate16_160_list_1_0_1_2_0_1.html
# Huawei: 613 goes into the brand slot
# https://detail.zol.com.cn/notebook_index/subcate16_613_list_1_0_1_2_0_1.html
# https://detail.zol.com.cn/notebook_index/subcate16_613_list_1_0_1_1_0_1.html
# Lenovo gaming laptops
# https://detail.zol.com.cn/notebook_index/subcate16_160_list_1_s1227_1_2_0_2.html
! pip install lxml
import urllib.request
from lxml import etree
import json
# Arguments: brand, laptop category, release period, and page number
# Human-readable filter label -> code used in the listing URL.
_BRAND_CODES = {"华为": "613", "联想": "160", "惠普": "223", "戴尔": "21"}
_MODEL_CODES = {"游戏本": "s1227", "商务本": "s1226"}
_TIME_CODES = {"2022年下半年": "s10097-", "2023年上半年": "s10098-"}


def createRequext(brand, model, time, startPage):
    """Build the HTTP request for one page of the ZOL notebook listing.

    Args:
        brand: Brand label ("华为", "联想", "惠普", "戴尔") or a raw brand
            code such as "613".
        model: Category label ("游戏本", "商务本") or a raw category code.
        time: Release-period label ("2022年下半年", "2023年上半年") or a raw
            period code.
        startPage: Page number to request; converted with ``str()``.

    Returns:
        A ``urllib.request.Request`` carrying a desktop-browser User-Agent.

    Raises:
        ValueError: If a label is not recognised and is not a plain ASCII
            code, because the resulting URL could never be sent.
    """
    # Unknown values pass through unchanged so callers can still supply the
    # site's raw codes directly.
    brand = _BRAND_CODES.get(brand, brand)
    model = _MODEL_CODES.get(model, model)
    time = _TIME_CODES.get(time, time)

    url = (
        "https://detail.zol.com.cn/notebook_index/subcate16_" + brand
        + "_list_1_" + time + model
        + "_1_1_0_" + str(startPage) + ".html"
    )
    # An unmapped Chinese label would otherwise surface much later as an
    # encoding error inside urlopen; fail here with a readable message.
    if not url.isascii():
        raise ValueError("unrecognised brand/model/time label in URL: " + url)

    # Debug aid: show which page is being requested.
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=headers)
# Fetch the page source
def getContent(request):
    """Send *request* and return the response body as text.

    Args:
        request: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The full response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # The with-block closes the connection even when read()/decode() raises;
    # the response was previously left open.
    with urllib.request.urlopen(request) as response:
        # Per the original author's note, ZOL declares GBK in its response
        # headers, so the body is decoded with that codec.
        return response.read().decode('GBK')
# Download the data
def _cleanName(rawName):
    """Reduce a listing title to the bare product name."""
    # Drop the parenthesised configuration suffix.
    # NOTE(review): the original repeated this "(" check twice; one of them
    # may have been meant for the full-width "(" - confirm against the site.
    cut = rawName.find("(")
    if cut != -1:
        rawName = rawName[:cut]
    # "/" is replaced so the name can serve as a file name (the original kept
    # a disabled image download that saved ./img/<name>.jpg).
    return rawName.replace('/', '_')


def _appendRecords(path, records):
    """Extend the JSON array stored in *path* with *records* and rewrite it."""
    try:
        with open(path, encoding="utf-8") as json_file:
            existing = json_file.read()
    except FileNotFoundError:
        existing = ""
    # A missing or freshly truncated file counts as an empty array.
    merged = json.loads(existing) if existing.strip() else []
    if not isinstance(merged, list):
        raise ValueError(path + " does not contain a JSON array")
    merged.extend(records)
    with open(path, "w", encoding="utf-8") as json_file:
        # indent=4 keeps the file human-readable.
        json_file.write(json.dumps(merged, indent=4))


def downLoad(content):
    """Scrape every product on one listing page and add it to data.json.

    The row-style listing (``div.list-box``) is read. The page also offers a
    grid layout (``ul#J_PicMode``) whose images are lazy-loaded, with the real
    URL in the ``.src`` attribute; the original code for it was disabled.

    Args:
        content: HTML source of a listing page.

    Returns:
        The list of spec dictionaries scraped from this page, each extended
        with ``"name"`` and ``"img"`` keys.

    Raises:
        IndexError: If the page has more detail links than names or images.
        ValueError: If data.json exists but does not hold a JSON array.
    """
    tree = etree.HTML(content)

    # Product names come from the thumbnails' alt text.
    nameList = [_cleanName(alt) for alt in tree.xpath("//div[@class='list-box']//a//img/@alt")]
    for name in nameList:
        print(name)

    # Thumbnail URLs; this layout is not lazy-loaded, so @src is usable.
    imgList = tree.xpath("//div[@class='list-box']//a//img/@src")
    for img in imgList:
        print(img)

    # Follow each "more" link to the parameter page,
    # e.g. https://detail.zol.com.cn/1397/1396968/param.shtml
    params = []
    paramList = tree.xpath("//div[@class='list-box']//a[@class='more']/@href")
    for index, href in enumerate(paramList):
        param = laptopDetails("https://detail.zol.com.cn" + href)
        param["name"] = nameList[index]
        param["img"] = imgList[index]
        params.append(param)

    # Merge into the existing array instead of appending a second one:
    # "[...][...]" in a single file is not valid JSON.
    _appendRecords("data.json", params)
    print("JSON数据已保存到文件")
    return params


brand = "华为"  # options:
# Brand options: "华为" "联想" "惠普" "戴尔"
model = "商务本"  # "游戏本" or "商务本"
time = "2022年下半年"  # "2023年上半年" or "2022年下半年"

# NOTE: downLoad calls laptopDetails, which this file defines further down;
# when running top to bottom, that definition must be executed first.

# Start from an empty data.json so results of an earlier run are not mixed in.
with open("data.json", "w"):
    print("清空data数据")

# The first page is fetched up front because it also carries the total count.
startPage = 1
request = createRequext(brand, model, time, startPage)
content = getContent(request)
downLoad(content)

tree = etree.HTML(content)
num = tree.xpath("//span[@class='total']//b//text()")
# 48 is the page size the original calculation assumed. Ceiling division
# avoids requesting an extra page when the total is an exact multiple of 48,
# and a missing counter falls back to the first page only.
total = int(num[0]) if num else 0
endPage = max(startPage, (total + 47) // 48)
print(endPage)

for page in range(startPage + 1, endPage + 1):
    # Build the request with the same filters as page 1; the original passed
    # only the page number, which raises TypeError.
    request = createRequext(brand, model, time, page)
    # Fetch the page source
    content = getContent(request)
    # Scrape and store the page
    downLoad(content)

# All pages processed
print("下载完成!!!")
# One entry per spec table that is read:
# (position of the <table> used in the XPath, key in the result, {row label: field}).
_SPEC_TABLES = (
    (2, "cpu", {"CPU型号": "model", "CPU主频": "frequency", "核心/线程数": "thread"}),
    (3, "memory", {"内存容量": "memorySize", "内存类型": "memoryType",
                   "硬盘容量": "diskSize", "硬盘描述": "diskType"}),
    (4, "screen", {"屏幕尺寸": "size", "屏幕分辨率": "ratio",
                   "屏幕刷新率": "refresh", "屏幕技术": "detail"}),
    (5, "gpu", {"显卡类型": "model"}),
    (8, "i_o", {"数据接口": "dataIo", "视频接口": "videoIo",
                "音频接口": "soundIo", "其他接口": "otherIo"}),
)
# CPU-table row whose value is appended to the base frequency.
_TURBO_LABEL = "最高睿频"


def _cellText(row, cell):
    """Return the first text node in the row's <th>/<td>, or "" if none."""
    # Link text is preferred; span text is the fallback, as in the original.
    for inner in ("a", "span"):
        texts = row.xpath(".//" + cell + "//" + inner + "/text()")
        if texts:
            return texts[0]
    return ""


def laptopDetails(url):
    """Download a product's parameter page and extract its key specs.

    Example: laptopDetails("https://detail.zol.com.cn/1399/1398668/param.shtml")

    Args:
        url: Address of the product's ``param.shtml`` page.

    Returns:
        A dict with the sections ``cpu``, ``memory``, ``screen``, ``gpu`` and
        ``i_o``; a field stays ``""`` when the page has no matching row.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Close the connection deterministically; it was previously left open.
    with urllib.request.urlopen(request) as response:
        # Per the original author's note, ZOL declares GBK in its response headers.
        content = response.read().decode('GBK')
    tree = etree.HTML(content)

    param = {
        # Processor: model, base/turbo frequency, cores/threads
        "cpu": {"model": "", "frequency": "", "thread": ""},
        # Storage: memory size and type, disk size and description
        "memory": {"memorySize": "", "memoryType": "", "diskSize": "", "diskType": ""},
        # Display: size, resolution, refresh rate, panel notes
        "screen": {"size": "", "ratio": "", "refresh": "", "detail": ""},
        # Graphics card
        "gpu": {"model": ""},
        # Ports
        "i_o": {"dataIo": "", "videoIo": "", "soundIo": "", "otherIo": ""},
    }

    # NOTE(review): sample output in this file shows values such as "512GB>"
    # and a "more ..." link in place of the resolution, which suggests the
    # link text is not always the spec value; extraction is left unchanged.
    for position, section, fields in _SPEC_TABLES:
        rows = tree.xpath("//table[" + str(position) + "]//tr")
        # The first row is the table heading, not a spec.
        for row in rows[1:]:
            label = _cellText(row, "th")
            value = _cellText(row, "td")
            # Rows without a label or a value used to raise IndexError.
            if not label or not value:
                continue
            if section == "cpu" and label == _TURBO_LABEL:
                # Turbo frequency is appended to the base one: "2.5GHz/4.5GHz".
                param["cpu"]["frequency"] = param["cpu"]["frequency"] + "/" + value
            elif label in fields:
                param[section][fields[label]] = value
    return param
3.2 结果展示
这是保存下来的数据,以 JSON 格式保存:
[{"cpu": {"model": "Intel \u9177\u777fi5 12500H","frequency": "2.5GHz/4.5GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR4X\uff08\u4f4e\u529f\u8017\u7248\uff09","diskSize": "512GB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1\uff0c\u83b1\u8335TUV\u65e0\u9891\u95ea\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a1\u00d7USB Type-C\uff0c1\u00d7Thunderbolt4","videoIo": "HDMI>","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook 14s 2022","img": "https://i0-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/00/03/ChMkK2NbjVqIcP9XAAALk4AEbQIAAJAUgAr3VsAAAur116.jpg"},{"cpu": {"model": "Intel \u9177\u777f i7 1260P","frequency": "2.1GHz/4.7GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR5\uff08\u4f4e\u529f\u8017\u7248\uff095200MHz>","diskSize": "1TB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "10.7\u4ebf\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a2\u00d7Thunderbolt4","videoIo": "","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook X Pro 2022 12\u4ee3\u9177\u777f\u7248","img": "https://i3-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/0B/09/ChMkLGLhSFOIRKdBAAALdLA0Z58AAFt-gOySAoAAAuM163.jpg"},{"cpu": {"model": "Intel \u9177\u777f i7 1260P","frequency": "2.1GHz/4.7GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": 
"LPDDR5\uff08\u4f4e\u529f\u8017\u7248\uff095200MHz>","diskSize": "512GB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "10.7\u4ebf\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a2\u00d7Thunderbolt4","videoIo": "","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook X Pro 2022 12\u4ee3\u9177\u777f\u7248","img": "https://i4-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/0B/09/ChMkK2LhSFOIPYUUAAALdLA0Z58AAFt-gOxCakAAAuM444.jpg"},{"cpu": {"model": "Intel \u9177\u777fi5 12500H","frequency": "2.5GHz/4.5GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR4X\uff08\u4f4e\u529f\u8017\u7248\uff09","diskSize": "1TB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1\uff0c\u83b1\u8335TUV\u65e0\u9891\u95ea\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a1\u00d7USB Type-C\uff0c1\u00d7Thunderbolt4","videoIo": "HDMI>","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook 14s 2022","img": "https://i3-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/00/03/ChMkLGNbjVqIa9TpAAALk4AEbQIAAJAUgAvYeAAAAur503.jpg"},{"cpu": {"model": "Intel \u9177\u777fi5 12450H","frequency": "2GHz/4.4GHz","thread": "\u516b\u6838\u5fc3/\u5341\u4e8c\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR4X\uff08\u4f4e\u529f\u8017\u7248\uff09","diskSize": "512GB>","diskType": ""},"screen": {"size": "16\u82f1\u5bf8","ratio": "1920x1200","refresh": "60Hz>","detail": 
"DC\u8c03\u5149\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a1\u00d7USB2.0\uff0c1\u00d7USB3.2","videoIo": "HDMI>","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook D 16 SE","img": "https://i4-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/03/03/ChMkLGNiH-qIP2kgAAANdGksIagAAJMHQP-tPgAAA2M174.jpg"},{"cpu": {"model": "Intel \u9177\u777fi7 12700H","frequency": "2.7GHz/4.7GHz","thread": "14\u6838\u5fc3/20\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR4X\uff08\u4f4e\u529f\u8017\u7248\uff09","diskSize": "1TB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1\uff0c\u83b1\u8335TUV\u65e0\u9891\u95ea\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a1\u00d7USB Type-C\uff0c1\u00d7Thunderbolt4","videoIo": "HDMI>","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook 14s 2022","img": "https://i0-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/00/03/ChMkK2NbjVuIWbqqAAALk4AEbQIAAJAUgAwiIsAAAur286.jpg"},{"cpu": {"model": "Intel \u9177\u777fi5 1240P","frequency": "1.7GHz/4.4GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR5\uff08\u4f4e\u529f\u8017\u7248\uff095200MHz>","diskSize": "512GB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "10.7\u4ebf\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": 
"\u5de6\u4fa7\uff1a2\u00d7Thunderbolt4","videoIo": "","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook X Pro 2022 12\u4ee3\u9177\u777f\u7248","img": "https://i3-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/0B/09/ChMkK2LhSFOIA3P8AAALdLA0Z58AAFt-gOyL2oAAAuM369.jpg"},{"cpu": {"model": "Intel \u9177\u777f i7 1260P","frequency": "2.1GHz/4.7GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR5\uff08\u4f4e\u529f\u8017\u7248\uff095200MHz>","diskSize": "1TB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "10.7\u4ebf\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a2\u00d7Thunderbolt4","videoIo": "","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook X Pro 2022 \u5fae\u7ed2\u5178\u85cf\u7248","img": "https://i0-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/0B/09/ChMkLGLhSFWIACpsAAAL26BNorcAAFt-gO2DjYAAAvz796.jpg"},{"cpu": {"model": "Intel \u9177\u777f i7 1260P","frequency": "2.1GHz/4.7GHz","thread": "12\u6838\u5fc3/16\u7ebf\u7a0b"},"memory": {"memorySize": "16GB","memoryType": "LPDDR5\uff08\u4f4e\u529f\u8017\u7248\uff095200MHz>","diskSize": "512GB>","diskType": "SSD\u56fa\u6001\u786c\u76d8>"},"screen": {"size": "14.2\u82f1\u5bf8","ratio": "\u66f4\u591a\u8d85\u9ad8\u6e05\u5c4f\u7b14\u8bb0\u672c>","refresh": "90Hz","detail": "10.7\u4ebf\uff0c\u83b1\u8335TUV\u786c\u4ef6\u7ea7\u4f4e\u84dd\u5149\u8ba4\u8bc1"},"gpu": {"model": "\u96c6\u6210\u663e\u5361>"},"i_o": {"dataIo": "\u5de6\u4fa7\uff1a2\u00d7Thunderbolt4","videoIo": "","soundIo": "\u8033\u673a/\u9ea6\u514b\u98ce\u4e8c\u5408\u4e00\u63a5\u53e3","otherIo": ""},"name": "HUAWEI MateBook X Pro 2022 
\u5fae\u7ed2\u5178\u85cf\u7248","img": "https://i0-prosmall-fd.zol-img.com.cn/t_s160x120/g7/M00/0B/09/ChMkK2LhSFSIcUniAAAL26BNorcAAFt-gO2AkMAAAvz046.jpg"}
]