from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.styles import Alignment
from PIL import Image as PILImage
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import os
import csv
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}


def GetUrl(url, div_num=6):
    """Collect the detail-page URL of every game listed on one index page."""
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    tree = etree.HTML(response.text)
    lis = tree.xpath(f'/html/body/div[{div_num}]/ul/li')
    for li in range(1, len(lis) + 1):
        url_a = "https://www.4399.com" + tree.xpath(f'/html/body/div[{div_num}]/ul/li[{li}]/a/@href')[0]
        every_urls.append(url_a)


def Get_Data(url):
    """Scrape one game page: append title/link/category to the CSV and download the cover image."""
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    if response.status_code != 200:
        return
    tree = etree.HTML(response.text)
    link = 'https://www.4399.com' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/@href')[0]
    title = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/text()')[0]
    font = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[4]/div/font/text()')[0]
    # Each worker appends its own row and closes the file immediately so the
    # buffered line is actually flushed to disk.
    with open("./game_data.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([title, link, font])
    # Download the cover image, keeping its original file extension.
    img_url = 'https:' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div[1]/a/img/@src')[0]
    ext = img_url.split(".")[-1]
    content = requests.get(url=img_url, headers=headers).content
    with open(f"./游戏图片/{title}.{ext}", "wb") as img_file:
        img_file.write(content)
    test.append("1")  # used only to count successfully scraped pages


def Save():
    """Read the CSV back and build game.xlsx with a half-size cover thumbnail in column A."""
    with open("game_data.csv", "r", encoding="utf-8", errors='ignore') as f:
        datas = list(csv.reader(f))
    wb = Workbook()
    sheet = wb.active
    alignment = Alignment(horizontal='center', vertical='center')
    sheet.column_dimensions['A'].width = 20
    sheet.column_dimensions['B'].width = 22
    sheet.column_dimensions['C'].width = 38.18
    for num in range(1, len(datas) + 1):
        sheet.row_dimensions[num].height = 75.5
        try:
            # Shrink the downloaded cover to half size so it fits the cell.
            # Only .jpg covers are looked up; anything else falls back to the
            # placeholder 无.jpg.
            tp = PILImage.open(f'./游戏图片/{datas[num - 1][0]}.jpg')
            w, h = tp.size
            thumb = tp.resize((w // 2, h // 2))
            image_path = f'./图片缓存/{datas[num - 1][0]}.jpg'
            thumb.save(image_path)
        except Exception:
            image_path = './无.jpg'
        try:
            sheet.add_image(Image(image_path), f'A{num}')
            sheet[f'B{num}'] = datas[num - 1][0]
            sheet[f'B{num}'].alignment = alignment
            sheet[f'C{num}'] = datas[num - 1][1]
            sheet[f'C{num}'].alignment = alignment
            sheet[f'D{num}'] = datas[num - 1][2]
            sheet[f'D{num}'].alignment = alignment
        except Exception:
            pass
    wb.save('game.xlsx')


if __name__ == '__main__':
    test = []
    every_urls = []
    # Create the folders for the downloaded covers and for the resized cache.
    for folder in ('./游戏图片', './图片缓存'):
        if not os.path.exists(folder):
            os.mkdir(folder)
            print(f"Created folder {folder}")
    # Truncate the CSV so every run starts from an empty file;
    # Get_Data re-opens it in append mode from the worker threads.
    open("./game_data.csv", "w", encoding="utf-8", newline="").close()
    urls = [f'https://www.4399.com/flash/new_{i}.htm' for i in range(2, 11)]
    # The first index page uses a different div index than the later pages.
    GetUrl("https://www.4399.com/flash/new.htm", div_num=8)
    with ThreadPoolExecutor(max_workers=10) as e:
        for url in urls:
            e.submit(GetUrl, url)
    print("All game URLs collected by the thread pool...")
    with ThreadPoolExecutor(max_workers=100) as e:
        for url in every_urls:
            e.submit(Get_Data, url)
    print("Total pages scraped:", len(test))
    print("Saving images and data...")
    Save()
    print("Data fetched and saved.")