1. 需要导入的库
import pandas as pd
import requests
2. 请求地址
通过分析可知,数据可以直接从接口获取(接口返回 JSON),无需解析页面标签,直接从响应中取出我们需要的字段即可。
def fetch_hot_news(api_url, timeout=10):
    """Fetch the hot-news list from a JSON API endpoint.

    Args:
        api_url: URL of the endpoint to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The value stored under ``data -> hotNews`` in the JSON response, or
        an empty list when the request fails, the status code is not 200,
        the body is not valid JSON, or the payload lacks that path.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts and invalid URLs all land here.
        print(f"Failed to retrieve data. Request error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        # JSON decode errors raised by ``response.json()`` subclass ValueError.
        print(f"Failed to retrieve data. Invalid JSON: {exc}")
        return []

    # Guard each level: the API may answer with ``"data": null`` or a
    # non-object payload, which would break a chained ``.get()`` call.
    data_section = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data_section, dict):
        return []
    return data_section.get("hotNews") or []
3. 导出表格
def export_to_excel(hot_news_data, output_path="pengpai-top.xlsx"):
    """Write hot-news records to an Excel file, adding a detail-page URL.

    Args:
        hot_news_data: List of dicts describing news items. The ``contId``,
            ``name`` and ``pubTime`` keys are exported; a key missing from a
            record yields an empty cell instead of aborting the export.
        output_path: Destination file. Defaults to the file name the
            original script always used.

    Returns:
        None. Nothing is written when ``hot_news_data`` is empty or None.
    """
    if not hot_news_data:
        return

    # Prefix of the article detail page; the domain is a placeholder here.
    base_url = "https://www.xxx.cn/newsDetail_forward_"
    relevant_columns = ["contId", "name", "pubTime", "URL"]

    # Add the URL field, which the API response does not include.
    rows = []
    for news in hot_news_data:
        cont_id = news.get("contId")
        # Leave the URL empty rather than building a link ending in "None".
        url = f"{base_url}{cont_id}" if cont_id is not None else None
        rows.append({**news, "URL": url})

    # reindex keeps only the relevant columns, in this order, and creates
    # any that are absent from every record (plain ``df[cols]`` would raise).
    df = pd.DataFrame(rows).reindex(columns=relevant_columns)

    df.to_excel(output_path, index=False)
    print("Data exported to Excel successfully.")
4. 处理 URL(主函数)
def main():
    """Fetch the hot-news list from the API and export it to Excel.

    Prints a notice and exports nothing when the fetch yields no items.
    """
    # Endpoint found by analysing the site's requests (placeholder value).
    endpoint = "https:/xx/wwwIndex/xxx"
    news_items = fetch_hot_news(endpoint)

    if not news_items:
        print("No hot news data found.")
        return

    export_to_excel(news_items)