from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.styles import Alignment
from PIL import Image as PILImage
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import os
import csv
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}


def GetUrl(url, div_num=6):
    """Collect the detail-page URL of every game listed on one index page."""
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    tree = etree.HTML(response.text)
    lis = tree.xpath(f'/html/body/div[{div_num}]/ul/li')
    for li in range(1, len(lis) + 1):
        url_a = "https://www.4399.com" + tree.xpath(f'/html/body/div[{div_num}]/ul/li[{li}]/a/@href')[0]
        every_urls.append(url_a)


def Get_Data(url):
    """Scrape one game page: append title/link/category to the CSV and download the cover image."""
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    if response.status_code != 200:
        return
    tree = etree.HTML(response.text)
    link = 'https://www.4399.com' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/@href')[0]
    title = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/text()')[0]
    font = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[4]/div/font/text()')[0]
    # Each worker appends its own row and closes the file immediately so the
    # buffered line is actually flushed to disk.
    with open("./game_data.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([title, link, font])
    # Download the cover image, keeping its original file extension.
    img_url = 'https:' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div[1]/a/img/@src')[0]
    ext = img_url.split(".")[-1]
    content = requests.get(url=img_url, headers=headers).content
    with open(f"./游戏图片/{title}.{ext}", "wb") as img_file:
        img_file.write(content)
    test.append("1")  # used only to count successfully scraped pages


def Save():
    """Read the CSV back and build game.xlsx with a half-size cover thumbnail in column A."""
    with open("game_data.csv", "r", encoding="utf-8", errors='ignore') as f:
        datas = list(csv.reader(f))
    wb = Workbook()
    sheet = wb.active
    alignment = Alignment(horizontal='center', vertical='center')
    sheet.column_dimensions['A'].width = 20
    sheet.column_dimensions['B'].width = 22
    sheet.column_dimensions['C'].width = 38.18
    for num in range(1, len(datas) + 1):
        sheet.row_dimensions[num].height = 75.5
        try:
            # Shrink the downloaded cover to half size so it fits the cell.
            # Only .jpg covers are looked up; anything else falls back to the
            # placeholder 无.jpg.
            tp = PILImage.open(f'./游戏图片/{datas[num - 1][0]}.jpg')
            w, h = tp.size
            thumb = tp.resize((w // 2, h // 2))
            image_path = f'./图片缓存/{datas[num - 1][0]}.jpg'
            thumb.save(image_path)
        except Exception:
            image_path = './无.jpg'
        try:
            sheet.add_image(Image(image_path), f'A{num}')
            sheet[f'B{num}'] = datas[num - 1][0]
            sheet[f'B{num}'].alignment = alignment
            sheet[f'C{num}'] = datas[num - 1][1]
            sheet[f'C{num}'].alignment = alignment
            sheet[f'D{num}'] = datas[num - 1][2]
            sheet[f'D{num}'].alignment = alignment
        except Exception:
            pass
    wb.save('game.xlsx')


if __name__ == '__main__':
    test = []
    every_urls = []
    # Create the folders for the downloaded covers and for the resized cache.
    for folder in ('./游戏图片', './图片缓存'):
        if not os.path.exists(folder):
            os.mkdir(folder)
            print(f"Created folder {folder}")
    # Truncate the CSV so every run starts from an empty file;
    # Get_Data re-opens it in append mode from the worker threads.
    open("./game_data.csv", "w", encoding="utf-8", newline="").close()
    urls = [f'https://www.4399.com/flash/new_{i}.htm' for i in range(2, 11)]
    # The first index page uses a different div index than the later pages.
    GetUrl("https://www.4399.com/flash/new.htm", div_num=8)
    with ThreadPoolExecutor(max_workers=10) as e:
        for url in urls:
            e.submit(GetUrl, url)
    print("All game URLs collected by the thread pool...")
    with ThreadPoolExecutor(max_workers=100) as e:
        for url in every_urls:
            e.submit(Get_Data, url)
    print("Total pages scraped:", len(test))
    print("Saving images and data...")
    Save()
    print("Data fetched and saved.")