python爬虫(二) 之 42号网汽车文章爬虫
今天在闲鱼上有个买家找我做一个42号网汽车文章的爬虫,目前需求已经完成,现将这部分代码开源,供大家参考。该爬虫能够抓取网站上所有文章的数据,大概一小时左右就能将这个网站上的数据全部抓取下来。
import requests
import json
import csv
from lxml import etree
import time


class How42:
    """Crawler that exports articles from 42how.com into a CSV file.

    The article list is read page by page from the JSON API at
    ``api.42how.com``; for every listed article the detail page on
    ``www.42how.com`` is downloaded and its body text extracted with XPath.
    One CSV row is written per article.
    """

    # Column titles of the output CSV; ``_build_row`` must emit values in
    # exactly this order.
    CSV_TITLE = ["标题", "作者", "发布时间", "原文地址", "正文"]

    # Seconds to wait for a server response before giving up, so that a
    # stalled connection cannot hang the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # The list API URL is assembled as: pre + page number + post.
        self.article_list_pre_url = "https://api.42how.com/article?page="
        self.article_list_post_url = "&pageSize=10&orderBy=createTime&order=DESC&isProfessional=true&userType=0"
        # First and last page (both inclusive) of the list API to crawl.
        self.start_page = 1
        self.end_page = 1000
        self.payload = {}
        # Browser-like headers sent to the JSON list API.
        self.article_list_headers = {
            'authority': 'api.42how.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'cookie': '_ga_6GM2YNVSMY=GS1.1.1710298637.1.0.1710298637.60.0.0; _ga=GA1.1.383334843.1710298637',
            'origin': 'https://www.42how.com',
            'pragma': 'no-cache',
            'referer': 'https://www.42how.com/',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'source-type': '42web',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        }
        # Browser-like headers sent when fetching an article's HTML page.
        self.article_detail_headers = {
            'authority': 'www.42how.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'cookie': 'i18n_redirected=zh; _ga=GA1.1.383334843.1710298637; _ga_6GM2YNVSMY=GS1.1.1710302704.2.0.1710302704.60.0.0',
            'pragma': 'no-cache',
            'referer': 'https://www.42how.com/?l=article',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        }

    def get_request(self, url, headers):
        """Send a GET request to *url* and return the response body as text.

        Args:
            url: Address to fetch.
            headers: HTTP headers to send with the request.

        Returns:
            The decoded response body.
        """
        response = requests.request(
            "GET",
            url,
            headers=headers,
            data=self.payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.text

    def do_work(self):
        """Crawl the configured page range and write every article to ``42号.csv``.

        The file is created in the current working directory and overwritten
        if it already exists. Crawling stops early as soon as the list API
        returns a page without articles.
        """
        with open('42号.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            writer.writerow(self.CSV_TITLE)
            # end_page is reported to the user as the total page count, so it
            # has to be included in the crawl.
            for current_page in range(self.start_page, self.end_page + 1):
                print("================> 当前第" + str(current_page) + "页,共" + str(self.end_page) + "页 ============")
                text = self.get_request(
                    self._article_list_url(current_page),
                    headers=self.article_list_headers,
                )
                data = json.loads(text)["data"]
                if not data:
                    # No articles on this page: later pages cannot hold any
                    # either, so do not keep requesting up to end_page.
                    print("================> 第" + str(current_page) + "页没有数据,抓取结束 ============")
                    break
                self.write_page(writer, data)

    def write_page(self, writer, data):
        """Fetch every article of one list page and write it as a CSV row.

        Args:
            writer: A ``csv.writer``-like object providing ``writerow``.
            data: Iterable of article dicts as returned by the list API. Each
                must provide ``id``, ``title``, ``createTime`` and
                ``author["nickname"]``.
        """
        for item in data:
            article_url = "https://www.42how.com/article/" + str(item["id"])
            text = self.get_request(article_url, headers=self.article_detail_headers)
            result = self._extract_body(text)
            print(result)
            writer.writerow(self._build_row(item, article_url, result))
            print("===========> 当前文章 " + article_url + " 写入完毕")

    def _article_list_url(self, page):
        """Return the full list-API URL for *page*, including the query suffix."""
        return self.article_list_pre_url + str(page) + self.article_list_post_url

    @staticmethod
    def _extract_body(text):
        """Return the whitespace-normalized article body found in HTML *text*.

        An empty string is returned when there is no document to query.
        """
        # Guard against an empty response before handing it to the parser.
        html = etree.HTML(text) if text and text.strip() else None
        if html is None:
            return ""
        return html.xpath('normalize-space(//*[@id="nice"]/div/div[1])')

    @staticmethod
    def _build_row(item, article_url, body):
        """Return one CSV row whose values follow the order of ``CSV_TITLE``."""
        return [
            item["title"],
            item["author"]["nickname"],
            item["createTime"],
            article_url,
            body,
        ]


if __name__ == '__main__':
    how42 = How42()
    how42.do_work()
下面是程序的运行结果,最终抓取的数据保存在同级目录下的 42号.csv 文件中。