思路
写文章的时间2023-8-4,大部分网页设置的区域都是先是省,然后通过省获取对应的市,再通过市获取对应的区,以此类推。所以模拟的请求也是按照这个逻辑,先获取所有的省,再获取所有的市,最后获取所有的区。
实现方法
博主其实是需要根据用户指定的地区做一些其他的请求,但是用户不可能知道这个地区对应的id,所以直观的展示就想到:用区域的名作为键,id作为值,但是区域名字可能会相同,例如不同的市但存在同名的区,所以最终想到用嵌套字典的方法来进行存储,因为不可能省市区都是一样的。
所以最终字典的样子是这样子的:
代码
代码解释
获取区域对应的id和名字,走了三次循环进行获取,定义一个字典,将获取到的区域的名字作为键,值也是一个字典,这个字典就包含当前区域的id,然后下一级的区域名字,如此嵌套。字典的样子:
{'广东':{"id": "19","潮州市": {"id": "1705","潮安区": {"id": "19992",},},"东莞市": {"id": "1655","常平镇": {"id": "4886",},"长安镇": {"id": "4760",},},},'其他省份':{'以此类推'}
}
具体详细代码
对于数据解析部分,由于每个网页返回的结构不一样,需要自己去解析对应的数据
# -*- coding: utf-8 -*-
# @Time : 2023/7/13 15:05
# @Author : gongzairen
# @File : get_areas_id_dict.pyimport json
import random
import timeimport requestsclass GetAreasId:def __init__(self,sku_id):self.sku_id = sku_idself.data_dict = {}# 获取区域iddef get_areas_id(self):""":return: 返回的是包含省,市,区的id 的dict逻辑很混乱多层嵌套,效果并不好样例:{'北京': {'id': '2', '北京市': {'id': '36', '东城区': {'id': '377'}, }},'天津': {'id': '3', '天津市': {'id': '37', '和平区': {'id': '395'},}}}"""data_dict = {"data": "","error_info": "无",'warning_info': "无",}url = '对应的url'headers = HEADERSheaders['Referer'] = f'对应的referer'headers['Accept'] = '对应的Accept'all_areas_id_dict = {}# 获取省idprovince_payload = {'请求的参数': '请求的参数',}province_response = requests.get(url,params=province_payload,headers=headers)province_json = province_response.json()province_response.close()# print(province_json)try:id_lists = province_json['rows']for id_dict in id_lists:province_name = id_dict['name']province_id_dict = {'id': str(id_dict['id']),'level': str(id_dict['level']),}all_areas_id_dict[province_name] = province_id_dictexcept Exception as e:data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'print('区域id结果已发生改变,请联系开发人员')print(e)return data_dict# 获取市idfor province_name in all_areas_id_dict:time.sleep(random.uniform(0,2.2))city_payload = {'请求的参数': '请求的参数',}city_response = requests.get(url, params=city_payload, headers=headers)city_json = city_response.json()city_response.close()# print(city_json)try:id_lists = city_json['rows']for id_dict in id_lists:city_name = id_dict['name']city_id_dict = {'id': str(id_dict['id']),'level': str(id_dict['level']),'parentId':str(id_dict['parentId'])}all_areas_id_dict[province_name][city_name] = city_id_dictexcept Exception as e:data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'print('区域id结果已发生改变,请联系开发人员')print(e)return data_dict# 获取区idfor province_name in all_areas_id_dict:city_list = list(all_areas_id_dict[province_name].keys())for city_name in city_list:if city_name == "id" or city_name == "level":continuetime.sleep(random.uniform(1, 3))area_payload = {'请求的参数': '请求的参数',}area_response = requests.get(url, params=area_payload, headers=headers)area_json = area_response.json()area_response.close()print(area_json)try:id_lists = area_json['rows']for id_dict in id_lists:area_name = id_dict['name']area_id_dict = {'id': str(id_dict['id']),'level': str(id_dict['level']),'parentId':str(id_dict['parentId'])}all_areas_id_dict[province_name][city_name][area_name] = area_id_dictexcept Exception as e:data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'print('区域id结果已发生改变,请联系开发人员')print(e)return data_dictdata_dict['data] = all_areas_id_dictprint(all_areas_id_dict)return data_dicttest_sku_id = 'xxxxxx'HEADERS = {'Cookie': 'xxxxxx',
}
get_areas_obj = GetAreasId(test_sku_id)
get_areas_result_dict = get_areas_obj.get_areas_id()
if get_areas_result_dict ['error_info'] != "无":print('获取失败')
else:all_areas_id_dict = get_areas_result_dict['data']
生成对应的表
data = "上面生成的字典" # all_areas_id_dict
id_lists = []
for province_name in data:city_list = list(data[province_name].keys())for city_name in city_list:if city_name == "id" or city_name == "level" or city_name == 'parentId':continuearea_list = list(data[province_name][city_name].keys())for area_name in area_list:if area_name == "id" or area_name == "level" or area_name == 'parentId':continueid_lists.append([province_name,city_name,area_name])print(id_lists)df = pd.DataFrame(id_lists, columns=['省', '市', '区'])file_name = 'xxxx地区全表.xlsx'
df.to_excel(file_name, index=False)
print(f"Excel文件已生成: {file_name}")
效果如下