"""
====================================================================项目名称: 唯品会商品数据爬取
项目描述: 通过requests框架获取网页数据
项目环境: pycharm && python3.8
作者所属: 几许====================================================================一 . 抓包1. 对唯品会官网进行分析 -- 通过筛选直接搜索商品信息获得商品数据包https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v22. 对商品数据包进行分析 -- 一页数据分为三部分{50,50,20},不同商品含有一个产品信息的编码3. 获取本页所有产品信息的编码 -- 对编码进行检索 , 获得数据包https://mapi.vip.com/vips-mobile/rest/shopping/pc/search/product/rank二 . 代码实现1. 定义产品信息数据请求函数得到数据包 -- 一次请求只能得到120条数据2. 通过for循环进行翻页处理得到全部信息3. 解析获得所需要的数据 , 通过表格保存====================================================================""" import requests
from jsonpath import jsonpath
import re, json
from pprint import pprint
import time
import csv


class Spider:
    """Scrape lipstick ("口红") product listings from VIP.com into a CSV file.

    Workflow (two mapi.vip.com endpoints):
      1. ``praseSubpage`` pages through the search/product/rank endpoint
         (120 product ids per page) and splits each page's ids into
         50/50/20 chunks -- the detail endpoint caps ids per request.
      2. ``praseHomedata`` resolves one id chunk into product details via
         the product/module/list/v2 endpoint and writes one CSV row each.

    NOTE(review): rows are written through the module-level ``csv_write``
    DictWriter created in the ``__main__`` block, so the class only works
    when driven from this script.  The cookie below is a captured session
    cookie and will eventually expire -- TODO confirm it is still valid.
    """

    def __init__(self):
        # Browser-like request headers (including the captured session
        # cookie) so the mapi endpoints accept our requests.
        self.headers = {
            'authority': 'mapi.vip.com',
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': 'vip_cps_cuid=CU1703946155093f91b22c68d55b7591; vip_cps_cid=1703946155095_f8aebf721aa4d69f55487762e3ca4c21; cps_share=cps_share; PAPVisitorId=58460c7a22e31f6b4acb2a1ed741f921; vip_new_old_user=1; vip_city_name=%E5%B9%BF%E5%B7%9E%E5%B8%82; mars_cid=1703946163504_4eebec221de3364e0da3bbe4a2182454; VipUINFO=luc%3Aa%7Csuc%3Aa%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A0%7Cul%3A3105; mars_pid=0; vip_address=%257B%2522pname%2522%253A%2522%255Cu5b89%255Cu5fbd%255Cu7701%2522%252C%2522pid%2522%253A%2522103104%2522%252C%2522cname%2522%253A%2522%255Cu5e7f%255Cu5dde%255Cu5e02%2522%252C%2522cid%2522%253A%2522103104114%2522%257D; vip_province=103104; vip_province_name=%E5%AE%89%E5%BE%BD%E7%9C%81; vip_city_code=103104114; vip_wh=VIP_HZ; vip_ipver=31; user_class=a; cps=adp%3Antq8exyc%3A%40_%401704022867205%3Amig_code%3A4f6b50bf15bfa39639d85f5f1e15b10f%3Aac014miuvl0000b5sq8crnthcjdwurb0; mars_sid=ef0bd4aed17dd0eb261cda2a1a73e9d8; visit_id=43601963E9569AF31FBF7F1561D38FC9; vipshop_passport_src=https%3A%2F%2Fcategory.vip.com%2Fsuggest.php%3Fkeyword%3D%25E5%258F%25A3%25E7%25BA%25A2%26ff%3D235%7C12%7C1%7C1; vip_tracker_source_from=; vip_access_times=%7B%22list%22%3A0%2C%22detail%22%3A2%7D; pg_session_no=6; VipDFT=1',
            'referer': 'https://category.vip.com/',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'script',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        # Product-detail (JSONP) endpoint.
        self.home_url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2'
        # 1-based page counter, used only for progress output.
        self.num = 1

    def praseHomedata(self, product):
        """Fetch detail data for one chunk of product ids and write CSV rows.

        Args:
            product: comma-joined product-id string (at most 50 ids --
                the endpoint rejects larger batches).

        Side effects: writes one row per product through the module-level
        ``csv_write`` and prints each row for progress monitoring.
        """
        try:
            self.sub_data = {
                'callback': ' getMerchandiseDroplets1',
                'app_name': ' shop_pc',
                'app_version': ' 4.0',
                'warehouse': ' VIP_HZ',
                'fdc_area_id': ' 103104114',
                'client': ' pc',
                'mobile_platform': ' 1',
                'province_id': ' 103104',
                'api_key': ' 70f71280d5d547b2a7bb370a529aeea1',
                'user_id': ' ',
                'mars_cid': ' 1703946163504_4eebec221de3364e0da3bbe4a2182454',
                'wap_consumer': ' a',
                'productIds': product,
                'scene': ' search',
                'standby_id': ' nature',
                'extParams': ' {"stdSizeVids":"","preheatTipsVer":"3","couponVer":"v2","exclusivePrice":"1","iconSpec":"2x","ic2label":1,"superHot":1,"bigBrand":"1"}',
                'context': ' ',
                '_': ' 1704027272052',
            }
            time.sleep(1)  # throttle requests to avoid triggering anti-bot limits
            response = requests.get(self.home_url, headers=self.headers,
                                    params=self.sub_data).text
            # The endpoint returns JSONP: callback(<json>).  Drop everything
            # up to and including the opening "callback(" marker...
            json_data = response.split('getMerchandiseDroplets1(')[1]
            # ...and cut at the LAST ')' only.  (The previous
            # re.sub(r'\)', '', ...) deleted EVERY ')' in the payload,
            # corrupting any title/attribute containing parentheses.)
            data = json_data[:json_data.rfind(')')]
            json_msg = json.loads(data)
            for prod in json_msg['data']['products']:
                # First matched "value" anywhere in the product node --
                # presumably the attribute text; verify against the payload.
                attrs = jsonpath(prod, '$..value')[0]
                row = {  # renamed from `dict`, which shadowed the builtin
                    "标题": prod['title'],
                    "品牌": prod['brandShowName'],
                    "原价": prod['price']['marketPrice'],
                    "折扣": prod['price']['saleDiscount'],
                    "现价": prod['price']['salePrice'],
                    "属性": attrs,
                }
                pprint(row)
                print('===========================================')
                csv_write.writerow(row)
        except Exception:
            # Original best-effort behaviour: any failure (e.g. paging past
            # the last result) quietly ends this chunk.  Narrowed from a
            # bare ``except`` so KeyboardInterrupt/SystemExit still work.
            print("爬取完毕")

    def praseSubpage(self):
        """Page through the search-rank endpoint and crawl every product.

        Iterates pageOffset = 0, 120, ..., 2280 (120 ids per page) and
        hands each page's ids to ``praseHomedata`` in 50/50/20 chunks.
        """
        rank_url = ('https://mapi.vip.com/vips-mobile/rest/shopping/pc/'
                    'search/product/rank')
        for page in range(0, 2400, 120):
            print(f"正在爬取第{self.num}页")
            # Same query string as before, expressed as a params dict so
            # requests handles the URL-encoding (keyword 口红 was %E5%8F%A3%E7%BA%A2).
            params = {
                'callback': 'getMerchandiseIds',
                'app_name': 'shop_pc',
                'app_version': '4.0',
                'warehouse': 'VIP_HZ',
                'fdc_area_id': '103104114',
                'client': 'pc',
                'mobile_platform': '1',
                'province_id': '103104',
                'api_key': '70f71280d5d547b2a7bb370a529aeea1',
                'user_id': '',
                'mars_cid': '1703946163504_4eebec221de3364e0da3bbe4a2182454',
                'wap_consumer': 'a',
                'standby_id': 'nature',
                'keyword': '口红',
                'lv3CatIds': '',
                'lv2CatIds': '',
                'lv1CatIds': '',
                'brandStoreSns': '',
                'props': '',
                'priceMin': '',
                'priceMax': '',
                'vipService': '',
                'sort': '0',
                'pageOffset': page,
                'channelId': '1',
                'gPlatform': 'PC',
                'batchSize': '120',
                '_': '1704022901946',
            }
            response = requests.get(rank_url, headers=self.headers,
                                    params=params).text
            pids = re.findall(r'{"pid":"(.*?)"}', response)
            # Detail endpoint takes at most 50 ids per call, so split the
            # page's 120 ids into 50/50/20 chunks.
            for lo, hi in ((0, 50), (50, 100), (100, 120)):
                self.praseHomedata(','.join(pids[lo:hi]))
            self.num += 1


if __name__ == '__main__':
    # ``csv_write`` is intentionally module-level: praseHomedata writes
    # through it.  Mode 'a' appends, so re-running adds a second header row.
    with open('唯品会口红数据.csv', 'a', encoding='utf-8', newline='') as f:
        csv_write = csv.DictWriter(
            f, fieldnames=["标题", "品牌", "原价", "折扣", "现价", "属性"])
        csv_write.writeheader()
        spider = Spider()
        spider.praseSubpage()