1.提示:
使用需要安装各种import的包,都是很基础的包,直接安装即可。
自备梯子 。
2.严肃警告
- 本程序仅可用作个人爱好,商业用途严禁!
- 请自觉遵守君子协定robots.txt
- 不要给网址过大的压力,每天以及同一时段的访问量请控制在一定程度内!
3.思路:
天天有群友找我要涩图,库存根本不够哇,得想办法弄点。
pixiv的爬取有很多大佬做过了,不过我看了一些都是弄得类似于项目一样,确实都很厉害,但我的需求简单,写在一个文件里适合我这种懒蛋。
- 首先通过 RankingCrawler 类的 get_multi_page_json 方法,获取榜单的json数据,将id添加到 collector 中。
- 然后通过 collector 的 collect 方法,遍历每个id,用 get_artworks_urls 方法获取id页面内所有图片的urls,将url添加到 downloader 中。
- 最后通过 downloader 的 download 方法,多线程调用同class内的 download_image 方法同时下载图片。
4.使用方法:
一般来说都是直接run就行,所有需要修改的参数都在 RankingCrawler 类的 __init__ 中,看一眼就明白。
5.代码如下:
import concurrent.futures as futures
import os
import re
import time
from datetime import datetime, timedelta
from typing import Callable, Dict, Iterable, Optional, Set, Tuple

import requests
from tqdm import tqdm


class Downloader():
    """Multi-threaded image downloader with a total-traffic cap (MB)."""

    def __init__(self, capacity, headers, threads, standard_time):
        self.url_group: Set[str] = set()  # image urls queued for download
        self.capacity = capacity          # max total download volume (MB)
        # today's date is used as the storage directory, e.g. "2024_02_16/"
        self.store_path = f"{datetime.now().strftime('%Y_%m_%d')}/"
        self.standard_time = standard_time  # read timeout (seconds)
        self.threads = threads
        self.headers = headers.copy()

    def add(self, urls: Iterable[str]):
        """Queue urls for download; the set silently drops duplicates."""
        for url in urls:
            self.url_group.add(url)

    def download_image(self, url: str) -> float:
        """Download a single image.

        Returns:
            The image size in MB, or 0 if it already exists or every retry failed.

        url example:
            "https://i.pximg.net/img-master/img/2024/02/10/03/09/52/115911580_p0_master1200.jpg"
        """
        image_name = url[url.rfind("/") + 1:]
        image_id = re.search(r"/(\d+)_", url).group(1)
        image_path = self.store_path + image_name
        # pixiv rejects image requests without a matching artwork Referer.
        # Copy the headers: this method runs concurrently via executor.map,
        # and mutating the shared self.headers dict would race between threads.
        headers = self.headers.copy()
        headers.update({"Referer": f"https://www.pixiv.net/artworks/{image_id}"})
        # Make sure the storage directory exists
        os.makedirs(self.store_path, exist_ok=True)
        # Skip files we already have
        if os.path.exists(image_path):
            return 0
        # Retry up to 10 times; with many threads a longer read timeout is fine
        for _ in range(10):
            try:
                response = requests.get(url, headers=headers,
                                        timeout=(4, self.standard_time))  # (connect, read)
                if response.status_code == 200:
                    # Original code raised a bare string (a TypeError in Python 3);
                    # raise a real exception so the retry loop handles it as intended.
                    if "content-length" not in response.headers:
                        raise ValueError("content-length not in response.headers")
                    image_size = int(response.headers["content-length"])
                    with open(image_path, "wb") as f:
                        f.write(response.content)
                    return image_size / (1 << 20)  # bytes -> MB
            except Exception:
                # Best-effort download: swallow the error and retry
                pass
        return 0

    def download(self):
        """Download all queued urls concurrently.

        Stops early once the accumulated traffic exceeds self.capacity (MB).

        Returns:
            Total downloaded volume in MB.
        """
        flow_size = .0
        print("===== downloader start =====")
        with futures.ThreadPoolExecutor(self.threads) as executor:
            # tqdm renders the progress bar
            with tqdm(total=len(self.url_group), desc="downloading") as pbar:
                # executor.map feeds each url to download_image and yields
                # each image's size back in order
                for image_size in executor.map(self.download_image, self.url_group):
                    flow_size += image_size
                    pbar.update()
                    pbar.set_description(f"downloading / flow {flow_size:.2f}MB")
                    if flow_size > self.capacity:
                        # Traffic cap hit: cancel everything still pending
                        executor.shutdown(wait=False, cancel_futures=True)
                        break
        print("===== downloader complete =====")
        return flow_size


class Collector():
    """Collects every image url inside each queued artwork id and feeds the downloader."""

    def __init__(self, threads, user_id, headers, downloader):
        self.id_group: Set[str] = set()  # illust ids
        self.threads = threads
        self.user_id = user_id
        self.headers = headers.copy()
        self.downloader = downloader

    def add(self, image_ids):
        """Queue a single illust id (string)."""
        self.id_group.add(image_ids)

    def select_page(self, response) -> Set[str]:
        """Extract all original-image urls from an ajax pages response.

        url: https://www.pixiv.net/ajax/illust/xxxx/pages?lang=zh

        Returns:
            Set[str]: original image urls
        """
        group = set()
        for url in response.json()["body"]:
            group.add(url["urls"]["original"])
        return group

    def get_artworks_urls(self, args: Tuple[str, Callable, Optional[Dict]]) -> Optional[Iterable[str]]:
        """GET the given url and run the selector on the response.

        Args:
            args: (url, selector function, extra headers) — packed into one
                  tuple because executor.map passes a single argument.

        Returns:
            Whatever the selector extracts, or None if all 10 attempts fail.
        """
        url, selector, additional_headers = args
        # Copy the headers: this runs on many threads at once, and updating
        # the shared self.headers dict would race / leak Referers between ids.
        headers = self.headers.copy()
        headers.update(additional_headers)
        time.sleep(1)
        # Retry up to 10 times
        for _ in range(10):
            try:
                response = requests.get(url, headers=headers, timeout=4)
                if response.status_code == 200:
                    return selector(response)
            except Exception as e:
                print(e)
                time.sleep(1)
        return None

    def collect(self):
        """Concurrently collect all image urls for every artwork and send them to the downloader.

        NOTE: an artwork may contain multiple images.
        """
        print("===== collector start =====")
        with futures.ThreadPoolExecutor(self.threads) as executor:
            with tqdm(total=len(self.id_group), desc="collecting urls") as pbar:
                # ajax endpoint for each illust id
                urls_list = [f"https://www.pixiv.net/ajax/illust/{illust_id}/pages?lang=zh"
                             for illust_id in self.id_group]
                # per-id headers (Referer must match the artwork page)
                additional_headers = [
                    {
                        "Referer": f"https://www.pixiv.net/artworks/{illust_id}",
                        "x-user-id": self.user_id,
                    }
                    for illust_id in self.id_group
                ]
                # executor.map passes each zipped tuple to get_artworks_urls and
                # yields each result; failed ids come back as None and are skipped
                for urls in executor.map(self.get_artworks_urls,
                                         zip(urls_list,
                                             [self.select_page] * len(urls_list),
                                             additional_headers)):
                    if urls is not None:
                        self.downloader.add(urls)
                    pbar.update()
        print("===== collector complete =====")
        return self.id_group


class RankingCrawler():
    def __init__(self):
        """Download artworks from the ranking page.

        Attributes (* = meant to be edited by the user):
            top_num:        how many top-ranked artworks to take
            *time_mode:     ranking period (daily, weekly, monthly, ...)
            *content:       content type (illust, manga, ugoira, ...)
            *headers:       request headers (Cookie + User-Agent)
            *threads:       number of worker threads
            *capacity:      max traffic volume (MB)
            *standard_time: standard read-timeout (seconds)
            user_id:        your own pixiv user id
            *date:          ranking date (yesterday)

        NOTE: the Cookie and user_id must belong to the same account.
        """
        self.top_num = 200
        self.time_mode = "weekly"
        self.content = "illust"
        self.headers = {
            "Cookie": "first_visit_datetime_pc=2022-08-02+15%3A36%3A28; p_ab_id=1; p_ab_id_2=7; p_ab_d_id=1023356603; yuid_b=QXQ4QA; privacy_policy_notification=0; a_type=1; b_type=0; d_type=1; login_ever=yes; __utmv=235335808.|2=login%20ever=yes=1^3=plan=normal=1^6=user_id=56850222=1^9=p_ab_id=1=1^10=p_ab_id_2=7=1^11=lang=zh=1; _im_vid=01HM1N03159737XTWCJAJ2BY5P; __utmz=235335808.1707968192.3.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); _gid=GA1.2.1257313260.1707968194; QSI_S_ZN_5hF4My7Ad6VNNAi=v:0:0; cf_clearance=sC.C_Nz8.49ropcKnVtVHwfrTz0lUg9aYEOSImWP_10-1707984781-1.0-AWnSRIR6YUxoAY8SgMleN0cv3AEKUf1k5ZpisVJ8snzbHvjFS5eX2oWL8Cd2oh0X4fq33MNgfAG7t8d80uS6gmQ=; _gcl_au=1.1.652233640.1707984789; _im_uid.3929=b.427a030d239f8f0f; _pbjs_userid_consent_data=3524755945110770; _im_vid=01HM1N03159737XTWCJAJ2BY5P; _pubcid=72d87e02-9d02-4eba-b647-77676ef3673c; __utma=235335808.1308491601.1659422191.1707968192.1707986109.4; __cf_bm=yav.mgbX7Rs1IN42BVskLJ0y56PpJ.1vuWOV.KN9160-1708063973-1.0-AVZMWvXlgMpvT1KiNIA0E0QeHN5d61NRsSZhDIrrqdV+zJXgRgtEXp1QfmcXtt1nCQ727jXsSJuN3AXfbNO49rvj6Lj7/mYJzBkiPMVZpREY; cto_bundle=LnH3rF8lMkJwbmI0WnQlMkZoTjlEVlBkalNGaVJLMWR4JTJGN1JGeFpJM0VQcDVMVVJNU0g1ZkxqOEdlS2VIMkFDbmZrYlNGa21xUlAxWHdJZm1ZY2g1VmE3emhYUnNWbyUyQjFreFp1M1VKMTQ2Tm9Bc051VHpCTU51SzhzNFVNbEZjJTJCa3dlZFBQSDlIZ1JmMlNwSWpoR0FIdFY2Z2ZlMHBBJTNEJTNE; cto_bidid=GceTm19EUWVHTDZEMjdZVEVad0g2Skh0bDdtMVpCJTJGNiUyRjlBQ3N0JTJCeVZaeUU0dWIzemIzRlVsWjZVWnlDcFh4VnJVQ0xNRERtRCUyQnRDRmU0UkQlMkZOdERHUWJsMmNNT0d1bUZJSnBlb3ZtREY1eUYxWG8lM0Q; cto_dna_bundle=wcDUY185WlVTMUslMkYzb3lGS2J1ekVhb2pXaVgyQnF5ZkFyelBXZUIyMEMlMkYlMkZOWnZGbkhVaFBjT1pRcmhvdEdzTlhOcTdKc0JJTmIxZXduS2tsUXUlMkZPVVBVM1N3JTNEJTNE; cc1=2024-02-16%2015%3A33%3A21; PHPSESSID=103427506_aygRqdHukP6BzGOBKfQDw2QadPmoxwvm; device_token=29dcd2275dd3ad33f978780a8732d797; privacy_policy_agreement=6; _ga_MZ1NL4PHH0=GS1.1.1708065204.3.1.1708065473.0.0.0; c_type=45; _ga_75BBYNYN9J=GS1.1.1708063957.12.1.1708065479.0.0.0; _ga=GA1.2.1308491601.1659422191; _gat_UA-1830249-3=1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
        }
        self.threads = 12
        self.capacity = 2048
        self.standard_time = 10
        self.user_id = "103427506"
        # ranking data lags one day behind
        self.date = (datetime.now() - timedelta(days=1)).strftime('%Y%m%d')
        self.downloader = Downloader(self.capacity, self.headers, self.threads, self.standard_time)
        self.collector = Collector(self.threads, self.user_id, self.headers, self.downloader)

    def get_multi_page_json(self):
        """Fetch the ranking json page by page (50 entries each) and queue every illust id."""
        for page in range(1, self.top_num // 50 + 1):
            url = f"https://www.pixiv.net/ranking.php?mode={self.time_mode}&content={self.content}&date={self.date}&p={page}&format=json"
            # Copy instead of aliasing self.headers so the extra ranking
            # headers don't permanently pollute the shared dict
            headers = self.headers.copy()
            headers.update({
                "Referer": f"https://www.pixiv.net/ranking.php?mode={self.time_mode}&date={self.date}",
                "x-requested-with": "XMLHttpRequest"
            })
            response = requests.get(url, headers=headers, timeout=(4, self.standard_time))
            if response.status_code == 200:
                art_works = response.json()["contents"]
                for art_work in art_works:
                    self.collector.add(str(art_work["illust_id"]))
            # be polite: one request per second
            time.sleep(1)

    def run(self):
        """Pipeline: ranking ids -> image urls -> downloaded files."""
        self.get_multi_page_json()
        self.collector.collect()
        self.downloader.download()


if __name__ == "__main__":
    RankingCrawler().run()
喜欢的话不妨点个赞吧?有人互动的感觉才能支撑我继续发文章呀~