一. 内容简介
python爬取bilibili,下载视频
二. 软件环境
2.1vsCode
2.2Anaconda
version: conda 22.9.0
2.3代码
链接:https://pan.baidu.com/s/1WuXTso_iltLlnrLffi1kYQ?pwd=1234
三.主要流程
3.1 下载单个视频
代码
import requests
import os
from lxml import etree
import redef videoDownload1(url_):# 设置用户代理,cookieheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 发送请求,得到响应对象response_ = requests.get(url_, headers=headers_)str_data = response_.text # 视频主页的html代码,类型是字符串# 使用xpath解析html代码,,得到想要的urlhtml_obj = etree.HTML(str_data) # 转换格式类型# 获取视频的名称res_ = html_obj.xpath('//title/text()')[0]# 视频名称的获取title_ = re.findall(r'(.*?)_哔哩哔哩', res_)[0]# 影响视频合成的特殊字符的处理,目前就遇到过这三个,实际上很有可能不止这三个,遇到了就用同样的方法处理就好了title_ = title_.replace('/', '')title_ = title_.replace(' ', '')title_ = title_.replace('&', '')title_ = title_.replace(':', '')# 使用xpath语法获取数据,取到数据为列表,索引[0]取值取出里面的字符串,即包含视频音频文件的url字符串url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]# 纯视频的urlvideo_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 纯音频的urlaudio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 设置跳转字段的headersheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36','Referer': url_}# 获取纯视频的数据response_video = requests.get(video_url, headers=headers_, stream=True)bytes_video = response_video.content# 获取纯音频的数据response_audio = requests.get(audio_url, headers=headers_, stream=True)bytes_audio = response_audio.content# 获取文件大小, 单位为KBvideo_size = int(int(response_video.headers['content-length']) / 1024)audio_size = int(int(response_audio.headers['content-length']) / 1024)# 保存纯视频的文件title_1 = title_ + '!' # 名称进行修改,避免重名title_1 = title_1.replace(':', '_')with open(f'{title_1}.mp4', 'wb') as f:f.write(bytes_video)# print(f'{title_1}纯视频文件下载完毕...,大小为:{video_size}KB, {int(video_size/1024)}MB')with open(f'{title_1}.mp3', 'wb') as f:f.write(bytes_audio)# print(f'{title_1}纯音频文件下载完毕...,大小为:{audio_size}KB, {int(audio_size/1024)}MB')# 利用第三方工具ffmpeg 合成视频, 需要执行终端命令ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"# os.system(f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy .\video\{title_}.mp4 -loglevel quiet')folder_path = f"./video/{title_}" # 替换为你想要创建的文件夹路径if not os.path.exists(folder_path):os.mkdir(folder_path)# print(f"The folder '{folder_path}' already exists.")command = f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy ./video/{title_}/{title_}.mp4 -loglevel quiet'os.system(command)# 显示合成文件的大小print(f'{title_} 下载完成')# 移除纯视频文件,os.remove(f'{title_1}.mp4')# 移除纯音频文件,os.remove(f'{title_1}.mp3')
3.2 下载选集视频
选集视频的播放链接很好找,就是后面的p=几啥的,拼一下就可以拿到整个的播放链接了
代码
import requests
import os
from lxml import etree
import re# 获取网页源码
def getUrls2(url):# 发送请求,得到响应对象# 设置用户代理,cookieheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}response_ = requests.get(url, headers=headers)str_data = response_.text # 视频主页的html代码,类型是字符串# 使用xpath解析html代码,,得到想要的urlhtml_obj = etree.HTML(str_data) # 转换格式类型urls = []# 获取了li的数量,lis = html_obj.xpath("//ul[@class='list-box']/li")question_mark_index = url.find('?')# 如果找到了 '?',就截取该位置之前的子串if question_mark_index != -1:cleaned_url = url[:question_mark_index]else:cleaned_url = url# print(cleaned_url)# 拼接apifor i in range(1,len(lis)+1):# print(i)strs = cleaned_url + "?p=" + str(i)urls.append(strs)# print(content)return urls
import requests
import os
from lxml import etree
import redef videoDownload3(url_,i,name):# 设置用户代理,cookieheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 发送请求,得到响应对象response_ = requests.get(url_, headers=headers_)str_data = response_.text # 视频主页的html代码,类型是字符串# 使用xpath解析html代码,,得到想要的urlhtml_obj = etree.HTML(str_data) # 转换格式类型# 获取视频的名称res_ = html_obj.xpath('//title/text()')[0]# 视频名称的获取title_ = re.findall(r'(.*?)_哔哩哔哩', res_)[0]fileName = name# 影响视频合成的特殊字符的处理,目前就遇到过这三个,实际上很有可能不止这三个,遇到了就用同样的方法处理就好了title_ = title_.replace('/', '')title_ = title_.replace(' ', '')title_ = title_.replace('&', '')title_ = title_.replace(':', '')title_ = title_.replace('-', '')title_ = title_.replace('—', '')# 使用xpath语法获取数据,取到数据为列表,索引[0]取值取出里面的字符串,即包含视频音频文件的url字符串url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]# 纯视频的urlvideo_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 纯音频的urlaudio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 设置跳转字段的headersheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36','Referer': url_}# 获取纯视频的数据response_video = requests.get(video_url, headers=headers_, stream=True)bytes_video = response_video.content# 获取纯音频的数据response_audio = requests.get(audio_url, headers=headers_, stream=True)bytes_audio = response_audio.content# 获取文件大小, 单位为KBvideo_size = int(int(response_video.headers['content-length']) / 1024)audio_size = int(int(response_audio.headers['content-length']) / 1024)# 保存纯视频的文件title_1 = title_ + '!' # 名称进行修改,避免重名title_1 = title_1.replace(':', '')with open(f'{title_1}.mp4', 'wb') as f:f.write(bytes_video)# print(f'{title_1}纯视频文件下载完毕...,大小为:{video_size}KB, {int(video_size/1024)}MB')with open(f'{title_1}.mp3', 'wb') as f:f.write(bytes_audio)# print(f'{title_1}纯音频文件下载完毕...,大小为:{audio_size}KB, {int(audio_size/1024)}MB')# 利用第三方工具ffmpeg 合成视频, 需要执行终端命令ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"# os.system(f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy .\video\{title_}.mp4 -loglevel quiet')folder_path = f"./video/{fileName}" # 替换为你想要创建的文件夹路径if not os.path.exists(folder_path):os.mkdir(folder_path)# print(f"The folder '{folder_path}' already exists.")command = f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy ./video/{fileName}/{i}.{title_1}.mp4 -loglevel quiet'file_path = f"./video/{fileName}/{i}.{title_}.mp4"if os.path.exists(file_path):passelse:os.system(command)# 显示合成文件的大小print(f'{i}.{title_} 下载完成')# 移除纯视频文件,os.remove(f'{title_1}.mp4')# 移除纯音频文件,os.remove(f'{title_1}.mp3')
3.3 下载合集视频
合集的里面数据的访问api
合集里面的数据,就是从这个里面拿到播放id,给json中的处理拿出来,拼接视频播放链接
代码
# 获取网页源码
def getUrls3(url):# 发送请求,得到响应对象# 设置用户代理,cookieheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 使用正则表达式提取数字pattern = r'\d+'numbers = re.findall(pattern, url)mid = numbers[0]season_id = numbers[1]page_num = 1url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"response = requests.get(url)if response.status_code == 200:json_data = response.json()# print(json_data["data"]["page"]["total"])total = int(json_data["data"]["page"]["total"])page_size = int(json_data["data"]["page"]["page_size"])page = int(total / page_size) + 1name = json_data["data"]["meta"]["name"]# print(total,page)urls = []# for i in range(1,page+1):# print(i) url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={i}&page_size=30"response = requests.get(url)if response.status_code == 200:json_data = response.json()archives = json_data["data"]["archives"]num = 0for j in archives:bvid = archives[num]["bvid"]videoUrl = f"https://www.bilibili.com/video/{bvid}/"num = num + 1urls.append(videoUrl)return urls,name
import requests
import os
from lxml import etree
import redef videoDownload2(url_,i):# 设置用户代理,cookieheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 发送请求,得到响应对象response_ = requests.get(url_, headers=headers_)str_data = response_.text # 视频主页的html代码,类型是字符串# 使用xpath解析html代码,,得到想要的urlhtml_obj = etree.HTML(str_data) # 转换格式类型# 获取视频的名称res_ = html_obj.xpath('//title/text()')[0]# 视频名称的获取title_ = re.findall(r'(.*?)_哔哩哔哩', res_)[0]fileName = html_obj.xpath('//h1[@class="video-title"]/text()')[0]# 影响视频合成的特殊字符的处理,目前就遇到过这三个,实际上很有可能不止这三个,遇到了就用同样的方法处理就好了title_ = title_.replace('/', '')title_ = title_.replace(' ', '')title_ = title_.replace('&', '')title_ = title_.replace(':', '')# 使用xpath语法获取数据,取到数据为列表,索引[0]取值取出里面的字符串,即包含视频音频文件的url字符串url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]# 纯视频的urlvideo_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 纯音频的urlaudio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 设置跳转字段的headersheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36','Referer': url_}# 获取纯视频的数据response_video = requests.get(video_url, headers=headers_, stream=True)bytes_video = response_video.content# 获取纯音频的数据response_audio = requests.get(audio_url, headers=headers_, stream=True)bytes_audio = response_audio.content# 获取文件大小, 单位为KBvideo_size = int(int(response_video.headers['content-length']) / 1024)audio_size = int(int(response_audio.headers['content-length']) / 1024)# 保存纯视频的文件title_1 = title_ + '!' # 名称进行修改,避免重名title_1 = title_1.replace(':', '_')with open(f'{title_1}.mp4', 'wb') as f:f.write(bytes_video)# print(f'{title_1}纯视频文件下载完毕...,大小为:{video_size}KB, {int(video_size/1024)}MB')with open(f'{title_1}.mp3', 'wb') as f:f.write(bytes_audio)# print(f'{title_1}纯音频文件下载完毕...,大小为:{audio_size}KB, {int(audio_size/1024)}MB')# 利用第三方工具ffmpeg 合成视频, 需要执行终端命令ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"# os.system(f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy .\video\{title_}.mp4 -loglevel quiet')folder_path = f"./video/{fileName}" # 替换为你想要创建的文件夹路径if not os.path.exists(folder_path):os.mkdir(folder_path)# print(f"The folder '{folder_path}' already exists.")command = f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy ./video/{fileName}/{i}.{title_}.mp4 -loglevel quiet'file_path = f"./video/{fileName}/{i}.{title_}.mp4"if os.path.exists(file_path):passelse:os.system(command)# 显示合成文件的大小print(f'{i}.{title_} 下载完成')# 移除纯视频文件,os.remove(f'{title_1}.mp4')# 移除纯音频文件,os.remove(f'{title_1}.mp3')
3.4 多线程
代码
import concurrent.futures
import requests# 定义一个下载函数
def download_video(URL):url, index, name = URL.split(" ", 2)videoDownload3(url,index,name)def THREAD(URLS):# 创建线程池,指定线程数量max_workers = 10 # 这里设置线程数量,根据需要进行调整with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:# 提交下载任务给线程池处理futures = [executor.submit(download_video, URL) for URL in URLS]# 等待所有任务完成for future in concurrent.futures.as_completed(futures):try:future.result() # 获取任务的结果(这里不需要结果)except Exception as e:print(f"An error occurred: {e}")
3.5 结果
url_model = "https://space.bilibili.com/471303350/channel/collectiondetail?sid=1278346 3"
value = url_model.split(' ')
url = value[0]
model = value[1]if model == "1":videoDownload1(url)print("下载完成")
if model == "2":# 接口分析# 点进去的话接口# https://www.bilibili.com/video/BV1qW4y1a7fU/?spm_id_from=333.337.search-card.all.click# 点击视频的话就这样# https://www.bilibili.com/video/BV1qW4y1a7fU?p=1# https://www.bilibili.com/video/BV1qW4y1a7fU?p=2&vd_source=de2dcd0f37ff916ec3f8fb83c6366123# 可以发现不同的集的接口格式应该是这样的,p = 几就是第几集# https://www.bilibili.com/video/BV1qW4y1a7fU?p=1# 查看有多少集# 一种是视频选集那块会写有多少个# 获取源码urls = getUrls2(url)i = 1for index,url in enumerate(urls):videoDownload2(url,index)print("下载完成")
if model == "3":# 接口分析# 视频合计每个视频接口没有规律,然后再播放页中网页没有直接的播放链接,所以就用合集页的链接来分析# 网页里面的每个链接都是动态加载的,需要访问json数据获取,也或者用虚拟浏览器那种等页面加载完成后访问(这种以后可能会更新,感觉这个有点麻烦),# 这里是用json数据做的# https://space.bilibili.com/107762251/channel/collectiondetail?sid=877119# https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=107762251&season_id=877119&sort_reverse=false&page_num=1&page_size=30# https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285# https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=389199842&season_id=1275285&sort_reverse=false&page_num=1&page_size=30# 这是两个接口,前面那个数字是用户,后面那个数字代表的是合集,下载的接口其实是股东urls,name = getUrls3(url)# print(len(urls))for index,url in enumerate(urls):# print(url)videoDownload3(url,index,name)# print(urls)# 多线程# for index,url in enumerate(urls):# URLS.append(url + " " + str(index) + " " + name)# THREAD(URLS)
那切里做展示,有些合集下载时候有点bug,还没找到问题,可以下载,但是保存路径有点问题,应该是和命令行冲突了,我就不改了
3.6 合集视频更新
原来会出现部分合集显示下载成功,但是文件夹里面没有东西,是因为有些合集名字在命令里面没办法执行,因为一些特殊符号什么的,所以把合集名字手动指定一下下载就可以了,然后多线程加上去,代码如下
拿视频链接的
# 获取网页源码
def getUrls3(url):# 发送请求,得到响应对象# 设置用户代理,cookieheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 使用正则表达式提取数字pattern = r'\d+'numbers = re.findall(pattern, url)mid = numbers[0]season_id = numbers[1]page_num = 1url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"response = requests.get(url)if response.status_code == 200:json_data = response.json()# print(json_data["data"]["page"]["total"])total = int(json_data["data"]["page"]["total"])page_size = int(json_data["data"]["page"]["page_size"])page = int(total / page_size) + 1name = json_data["data"]["meta"]["name"]# print(total,page)urls = []# for i in range(1,page+1):# print(i) url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={i}&page_size=30"response = requests.get(url)if response.status_code == 200:json_data = response.json()archives = json_data["data"]["archives"]num = 0for j in archives:bvid = archives[num]["bvid"]videoUrl = f"https://www.bilibili.com/video/{bvid}/"num = num + 1urls.append(videoUrl)return urls,name
下载视频的
import requests
import os
from lxml import etree
import redef videoDownload3(url_,index,name):# 设置用户代理,cookieheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36','Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"}# 发送请求,得到响应对象response_ = requests.get(url_, headers=headers_)str_data = response_.text # 视频主页的html代码,类型是字符串# 使用xpath解析html代码,,得到想要的urlhtml_obj = etree.HTML(str_data) # 转换格式类型# 获取视频的名称res_ = html_obj.xpath('//title/text()')[0]# 视频名称的获取title_ = re.findall(r'(.*?)_哔哩哔哩', res_)[0]# 影响视频合成的特殊字符的处理,目前就遇到过这三个,实际上很有可能不止这三个,遇到了就用同样的方法处理就好了title_ = title_.replace('/', '')title_ = title_.replace(' ', '')title_ = title_.replace('&', '')title_ = title_.replace(':', '')# 使用xpath语法获取数据,取到数据为列表,索引[0]取值取出里面的字符串,即包含视频音频文件的url字符串url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]# 纯视频的urlvideo_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 纯音频的urlaudio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]# 设置跳转字段的headersheaders_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36','Referer': url_}# 获取纯视频的数据response_video = requests.get(video_url, headers=headers_, stream=True)bytes_video = response_video.content# 获取纯音频的数据response_audio = requests.get(audio_url, headers=headers_, stream=True)bytes_audio = response_audio.content# 获取文件大小, 单位为KBvideo_size = int(int(response_video.headers['content-length']) / 1024)audio_size = int(int(response_audio.headers['content-length']) / 1024)# 保存纯视频的文件title_1 = title_ + '!' # 名称进行修改,避免重名title_1 = title_1.replace(':', '_')with open(f'{title_1}.mp4', 'wb') as f:f.write(bytes_video)# print(f'{title_1}纯视频文件下载完毕...,大小为:{video_size}KB, {int(video_size/1024)}MB')with open(f'{title_1}.mp3', 'wb') as f:f.write(bytes_audio)# print(f'{title_1}纯音频文件下载完毕...,大小为:{audio_size}KB, {int(audio_size/1024)}MB')# 利用第三方工具ffmpeg 合成视频, 需要执行终端命令ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"# os.system(f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy .\video\{title_}.mp4 -loglevel quiet')folder_path = f"./video/{name}" # 替换为你想要创建的文件夹路径if not os.path.exists(folder_path):os.mkdir(folder_path)# print(f"The folder '{folder_path}' already exists.")command = f'{ffmpeg_path} -i {title_1}.mp3 -i {title_1}.mp4 -c copy ./video/{name}/{index}.{title_}.mp4 -loglevel quiet'os.system(command)# 显示合成文件的大小print(f'{title_} 下载完成')# 移除纯视频文件,os.remove(f'{title_1}.mp4')# 移除纯音频文件,os.remove(f'{title_1}.mp3')
多线程
import concurrent.futures
import requests# 定义一个下载函数
def download_video(URL):url, index, name = URL.split(" ", 2)videoDownload3(url,index,name)def THREAD(URLS):# 创建线程池,指定线程数量max_workers = 10 # 这里设置线程数量,根据需要进行调整with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:# 提交下载任务给线程池处理futures = [executor.submit(download_video, URL) for URL in URLS]# 等待所有任务完成for future in concurrent.futures.as_completed(futures):try:future.result() # 获取任务的结果(这里不需要结果)except Exception as e:print(f"An error occurred: {e}")
执行
url_model = "https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285 3"
value = url_model.split(' ')
url = value[0]
model = value[1]if model == "1":videoDownload1(url)print("下载完成")
if model == "2":# 接口分析# 点进去的话接口# https://www.bilibili.com/video/BV1qW4y1a7fU/?spm_id_from=333.337.search-card.all.click# 点击视频的话就这样# https://www.bilibili.com/video/BV1qW4y1a7fU?p=1# https://www.bilibili.com/video/BV1qW4y1a7fU?p=2&vd_source=de2dcd0f37ff916ec3f8fb83c6366123# 可以发现不同的集的接口格式应该是这样的,p = 几就是第几集# https://www.bilibili.com/video/BV1qW4y1a7fU?p=1# 查看有多少集# 一种是视频选集那块会写有多少个# 获取源码urls = getUrls2(url)i = 1for index,url in enumerate(urls):videoDownload2(url,index)print("下载完成")
if model == "3":# 接口分析# 视频合计每个视频接口没有规律,然后再播放页中网页没有直接的播放链接,所以就用合集页的链接来分析# 网页里面的每个链接都是动态加载的,需要访问json数据获取,也或者用虚拟浏览器那种等页面加载完成后访问(这种以后可能会更新,感觉这个有点麻烦),# 这里是用json数据做的# https://space.bilibili.com/107762251/channel/collectiondetail?sid=877119# https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=107762251&season_id=877119&sort_reverse=false&page_num=1&page_size=30# https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285# https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=389199842&season_id=1275285&sort_reverse=false&page_num=1&page_size=30# 这是两个接口,前面那个数字是用户,后面那个数字代表的是合集,下载的接口其实是股东urls,name = getUrls3(url)name = "qml项目"URLS = []# print(len(urls))for index,url in enumerate(urls):# print(url)URLS.append(url + " " + str(index+1) + " " + name)THREAD(URLS)print("全部下载完成!!!")# print(urls)# for index,url in enumerate(urls):# URLS.append(url + " " + str(index) + " " + name)# THREAD(URLS)
四.参考
http://t.csdn.cn/6Pt7v 想下载B站视频却不知如何下手?一文教你爬B站!