我自己写的dcd爬虫,这个网站比较简单。看了看别人的程序,觉得用起来挺别扭,就自己捣鼓了一天。弄出来了。
这个网站没有反爬,有一些是动态网页,有一些是静态。
首先,获取销量排行榜前300的车型。
import os
import json

import requests
from parsel import Selector

# ---------------------------------------------------------#
# ---- * 获得车辆销售排行榜前300、100的车 * ----#
# ---------------------------------------------------------#
# Sales-rank endpoint; each request returns one page of 10 cars as JSON.
url = "https://www.dongchedi.com/motor/pc/car/rank_data"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}


def get_param(page):
    """Build the query parameters for one rank page.

    page: offset into the ranking, as a string ("0", "10", "20", ...).
    """
    params = {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "city_name": "烟台",
        "count": "10",
        "offset": page,
        "month": "",
        "new_energy_type": "",
        "rank_data_type": "11",
        "brand_id": "",
        "price": "",
        "manufacturer": "",
        "outter_detail_type": "",
        "nation": "0",
    }
    return params


def get_response(pageNum):
    """Fetch rank page `pageNum` (10 cars) and return the parsed JSON dict.

    Raises requests.HTTPError on a non-2xx status.
    """
    params = get_param(str(pageNum * 10))
    # NOTE(review): verify=False disables TLS certificate validation; kept to
    # preserve the original behaviour, but consider dropping it.
    with requests.get(url=url, headers=headers, params=params, verify=False) as resp:
        resp.raise_for_status()
        print(resp.status_code)
        # Parse the body while the response is still inside its context
        # manager; the original returned `resp` and called .json() after
        # the `with` block had already closed it.
        return resp.json()


# Top 300 = 30 pages of 10 cars each.
data_list = []
for i in range(30):
    print(f"销量前{i * 10} 的车")
    data_list.append(get_response(i))
获取之后,就能访问该车型,一般一个车型有好多款式,我的目的是想比较一些车型的尺寸,所以一个车型就选第一种款式,访问进入该车型第一种款式的参数配置,这样把参数下载下来,放到一个文件里,就可以比较现在卖的车的尺寸情况。
第二部分,我尝试了一下动态请求车型的价格。不过这一部分后面数据分析没有用到。
import jsonpath
import pandas as pd

# Pull the series name / id of every ranked car out of the 30 JSON pages.
name_list = jsonpath.jsonpath(data_list, "$..series_name")
id_list = jsonpath.jsonpath(data_list, "$..series_id")

# Each series lists its on-sale trim ids; keep only the first trim per
# series — one spec sheet per series is enough for the size comparison.
first_list = jsonpath.jsonpath(data_list, "$..online_car_ids")
car_id_list = []
for ls in first_list:
    if ls:
        first_id = ls[0]
    else:
        first_id = None  # series with no on-sale trim
    car_id_list.append(first_id)

df = pd.DataFrame({
    "name": name_list,
    "series": id_list,
    "first_id": car_id_list,
})
# Drop series without an on-sale trim id.
# (The original inspected them with df[df['first_id'] == None], which is a
# broken filter — `== None` never matches pandas missing values; use
# df['first_id'].isna() if that inspection is needed.)
df2 = df.dropna()
# The default index is written too; the next cell reads it back as the
# 'rank' column.
df2.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
# Re-load the saved CSV: the written index comes back as an unnamed first
# column, so give all four columns proper names and save again.
df = pd.read_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df.columns = ['rank', 'name', 'series', 'first_id']
# index=False: 'rank' already carries the row index; the original call
# wrote a second, duplicate index column into the file.
df.to_csv("Pythn-Anlys-138/dcd/top300cars.csv", index=False)

# ---------------------------------------------------------#
# ---- * 价格 * ----#
# ---------------------------------------------------------#


def get_price(car_id):
    """Download the dealer-price JSON for one trim id and save it to disk.

    car_id: trim id as a string; the result is written to
    "Pythn-Anlys-138/dcd/<car_id>.json".
    Raises requests.HTTPError on a non-2xx status.
    """
    wk_dir = "Pythn-Anlys-138/dcd"
    fname = car_id + ".json"
    price_url = "https://www.dongchedi.com/motor/pc/car/series/car_dealer_price"
    params = {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "car_ids": car_id,
        "city_name": "烟台",
    }
    # Reuse the module-level `headers`; the original had an invalid
    # placeholder dict ({。。。}) here that does not even parse.
    with requests.get(url=price_url, headers=headers, params=params, verify=False) as resp:
        resp.raise_for_status()
        rj = resp.json()
    with open(os.path.join(wk_dir, fname), 'w', encoding="utf-8") as f:
        f.write(json.dumps(rj, ensure_ascii=False))
    print(f"保存文件成功 {car_id} !!!")


# Smoke-test on `first_id`, left over from the extraction loop above.
# NOTE(review): if the last ranked series had no trims this is str(None);
# verify before relying on it.
first_id = str(first_id)
get_price(first_id)
这一部分后期没什么用,代码也很乱。
第三部分,获取某一车型的第一种款式的参数。
# ---------------------------------------------------------#
# ---- * 参数配置 * ----#
# ---------------------------------------------------------#


def get_detail_page(car_id):
    """Fetch the static spec-sheet page for one trim id and return its HTML.

    Raises requests.HTTPError on a non-2xx status.
    """
    detail_url = "https://www.dongchedi.com/auto/params-carIds-" + car_id
    # Reuse the module-level `headers`; the original had an invalid
    # placeholder dict ({。。。}) here. (The parameter was also named `id`,
    # shadowing the builtin.)
    with requests.get(url=detail_url, headers=headers, verify=False) as resp:
        resp.raise_for_status()
        return resp.text


def parse_detail(car_id):
    """Download and parse one trim's spec sheet.

    Returns {"id": car_id, "detail": [{label: value}, ...]}, one
    single-entry dict per row of the spec table.
    """
    selector = Selector(get_detail_page(car_id))
    dct_list = []
    for row in selector.css('div[data-row-anchor]'):
        label = row.css('div:nth-child(1) label::text').get()
        value = row.css('div:nth-child(2) div::text').get()
        dct_list.append({label: value})
    return {"id": car_id, "detail": dct_list}


def save_detail(car_id, dct_detail):
    """Write one parsed spec sheet to <car_id>_dcd_detail.json."""
    fname = car_id + "_dcd_detail.json"
    with open(os.path.join("Pythn-Anlys-138/dcd", fname), 'w', encoding='utf8') as f:
        f.write(json.dumps(dct_detail, ensure_ascii=False))
    print(f"Detail file {car_id} saved!!!")


# Download the spec sheet of every ranked series' first trim.
# The original looped over an undefined name `first_id_list`; the list that
# actually exists is `car_id_list`, built from the rank data above — skip
# its None entries and pass ids as strings for URL building.
for fid in car_id_list:
    if fid is None:
        continue
    fid = str(fid)
    save_detail(fid, parse_detail(fid))
最后,下载了一些json文件。后期做了一些数据整理。做成了数据表是这样的。
结果还不错。