配置pip源
更新pip pip install --upgrade pip
下载清华源 pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
requests
用代码模拟向浏览器发送请求
# Paginated query against Douban's TV recommend-groups endpoint;
# the response body is JSON.
import requests
import json

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
}

# Step through pages of 10 results each.
for start in range(0, 100, 10):
    res = requests.get(
        url="https://movie.douban.com/j/tv/recommend_groups",
        # BUG FIX: the original loop never used its counter, so every
        # iteration fetched the same page. Pass it as the paging offset.
        # NOTE(review): assumes the endpoint's offset parameter is named
        # "start" — confirm against the site's XHR traffic.
        params={"start": start},
        headers=HEADERS,
    )
    res.encoding = "utf-8"
    print(res.text)

    # Convert the JSON payload into a dict and print each group.
    data_dict = json.loads(res.text)
    for ele in data_dict['groups']:
        name = ele["name"]
        picture = ele['picture']
        url = ele['url']
        print(name, picture, '#######', url)
    print(data_dict)
# Scrape Douban's "high score" movie tag and print each title with its URL.
import requests
import json

response = requests.get(
    url="https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=50&page_start=0",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    },
)
response.encoding = "utf-8"

# The endpoint returns JSON; each entry under "subjects" is one movie.
payload = json.loads(response.text)
for subject in payload['subjects']:
    print(subject["title"], subject['url'])

# Sample output:
# 肖申克的救赎 https://movie.douban.com/subject/1292052/
# 爱乐之城 https://movie.douban.com/subject/25934014/
# 万物生灵:2023圣诞特别集 https://movie.douban.com/subject/35729996/
# 疯狂动物城 https://movie.douban.com/subject/25662329/
# 我不是药神 https://movie.douban.com/subject/26752088/
bs4(解析html格式文件)
# pip install BeautifulSoup4
# Scrape headlines, summaries, and image URLs from Autohome's news page:
# https://www.autohome.com.cn/news/
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    },
)
# Autohome serves gb2312-encoded pages (most sites use utf-8).
res.encoding = "gb2312"

soup = BeautifulSoup(res.text, features="html.parser")
# The news list lives inside <div class="article-wrapper">.
wrapper = soup.find(name='div', attrs={"class": "article-wrapper"})
for item in wrapper.find_all(name='li'):
    headline = item.find(name="h3")
    if not headline:
        # Placeholder <li> rows carry no headline — skip them.
        continue
    # .text extracts the tag's text content.
    print(headline.text)

    summary = item.find(name="p")
    if summary:  # guard: original crashed when the summary <p> was absent
        print(summary.text)

    img = item.find(name="img")
    if img:  # guard: not every item carries an image
        src = img.attrs['src']
        # The site emits protocol-relative URLs ("//..."); add a scheme
        # so the printed URL is directly usable.
        if src.startswith("//"):
            src = "https:" + src
        print(src)
# Exercise: scrape Autohome's editor roster (name, department, position,
# personal page link) from the news page's editor panel.
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

res = requests.get(
    url="https://www.autohome.com.cn/news/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    },
)
# Autohome serves gb2312-encoded pages (most sites use utf-8).
res.encoding = "gb2312"

soup = BeautifulSoup(res.text, features="html.parser")
# Editor entries are <li> rows inside <div class="editor-wrap">.
editor_wrap = soup.find(name='div', attrs={"class": "editor-wrap"})
for li in editor_wrap.find_all(name='li'):
    name_div = li.find(name="div", attrs={"class": "editorname"})
    dept_div = li.find(name="div", attrs={"class": "dept"})
    pos_div = li.find(name="div", attrs={"class": "position"})
    link = li.find(name="a")
    if not (name_div and dept_div and pos_div and link):
        # Guard: decorative <li> rows lack these nodes — the original
        # raised AttributeError/TypeError on them.
        continue
    print(name_div.text, dept_div.text, pos_div.text, link["href"])
# Fetch product titles from the China Unicom mall and download each
# product's images into the current directory.
import re
import requests
# BeautifulSoup parses HTML and XML documents.
from bs4 import BeautifulSoup

page_res = requests.get(
    url="https://mall.10010.com/bj/",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    },
)
page_res.encoding = "utf-8"

soup = BeautifulSoup(page_res.text, features="html.parser")
# Product cards are <li> rows inside <div class="mobileZone">.
zone = soup.find(name='div', attrs={"class": "mobileZone"})
for li in zone.find_all(name='li'):
    title_tag = li.find("p")
    if title_tag is None:
        continue
    title = title_tag.text
    # Strip filesystem-hostile characters before using the title as a name.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)

    for idx, img_tag in enumerate(li.find_all(name='img')):
        src = img_tag['src']
        # Protocol-relative URLs ("//...") are not fetchable as-is.
        if src.startswith("//"):
            src = "https:" + src
        # BUG FIX: use a separate variable — the original reassigned `res`,
        # clobbering the page response inside the loop.
        img_res = requests.get(url=src)
        # BUG FIX: number the files — the original wrote every image of a
        # product to the same "{title}.jpg", overwriting earlier downloads.
        filename = "{}_{}.jpg".format(safe_title, idx)
        with open(filename, 'wb') as f:
            f.write(img_res.content)

    print(title)