import whisper
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import StreamingResponse
import io
import torch
import numpy as np
from pydantic import BaseModel
import os
import tempfile

# Load the Whisper speech-recognition model (medium checkpoint) once at
# module import time so every request reuses the same instance.
model = whisper.load_model("medium")
# FastAPI application
app = FastAPI()


def pad_or_trim(array, length: int = 16000 * 30, *, axis: int = -1):
    """Split the audio into stacked segments of ``length`` samples along ``axis``.

    Audio longer than ``length`` is chopped into consecutive segments; the
    final (or only) segment is zero-padded up to ``length``.  Accepts either a
    ``torch.Tensor`` or a ``numpy.ndarray`` and returns the matching type with
    a new leading segment dimension.

    Args:
        array: Waveform to segment (any rank; segmentation along ``axis``).
        length: Samples per segment — default 30 s at 16 kHz, Whisper's window.
        axis: Axis to segment along (default: last).

    Returns:
        Tensor/array of shape ``(n_segments, ...)`` where the ``axis``
        dimension of each segment is exactly ``length``.
    """
    is_tensor = torch.is_tensor(array)
    size = array.shape[axis]

    if size == 0:
        # Degenerate input: return a single all-zero segment instead of
        # crashing on stack([]) as the naive loop would.
        shape = list(array.shape)
        shape[axis] = length
        if is_tensor:
            return torch.zeros([1] + shape, dtype=array.dtype, device=array.device)
        return np.zeros([1] + shape, dtype=array.dtype)

    segments = []
    for start in range(0, size, length):
        stop = min(start + length, size)
        if is_tensor:
            idx = torch.arange(start, stop, device=array.device)
            segment = array.index_select(dim=axis, index=idx)
        else:
            segment = array.take(indices=range(start, stop), axis=axis)
        if segment.shape[axis] < length:
            pad_widths = [(0, 0)] * segment.ndim
            pad_widths[axis] = (0, length - segment.shape[axis])
            if is_tensor:
                # F.pad expects a flat pad list ordered last-dimension-first.
                segment = torch.nn.functional.pad(
                    segment, [pad for sizes in pad_widths[::-1] for pad in sizes]
                )
            else:
                segment = np.pad(segment, pad_widths)
        segments.append(segment)
    return torch.stack(segments) if is_tensor else np.stack(segments)


@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file, streaming results as SSE events.

    Emits one ``Detected language`` event and one transcription event per
    30-second segment, as ``text/event-stream`` data frames.
    """
    # Persist the upload to disk so ffmpeg (inside whisper.load_audio) can
    # read it; delete=False because we reopen the path by name below.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(file.file.read())
        tmp_path = tmp.name

    try:
        audio = whisper.load_audio(tmp_path)
    finally:
        # The waveform is fully in memory now — remove the temp file
        # immediately so it cannot leak if decoding fails or the client
        # disconnects mid-stream (the original deleted it only at the very
        # end of the generator).
        os.remove(tmp_path)

    # Chunk/zero-pad to Whisper's 30-second windows; adds a batch dimension.
    batch = pad_or_trim(audio)

    async def generate():
        # NOTE(review): the model calls below are CPU/GPU-blocking and run on
        # the event loop; consider offloading via run_in_executor under load.
        mel = whisper.log_mel_spectrogram(batch).to(model.device)

        # detect_language on a batched mel yields one probability dict per
        # segment; don't shadow the list with the loop variable.
        _, probs = model.detect_language(mel)
        for segment_probs in probs:
            detected_language = max(segment_probs, key=segment_probs.get)
            yield f"data: Detected language: {detected_language}\n\n"

        # Decode every segment and stream each transcription as it is read.
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
        for r in result:
            yield f"data: {r.text}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=6006)
测试方法
1. 使用 curl
curl -X POST "http://127.0.0.1:6006/transcribe/" -H "accept: text/event-stream" -H "Content-Type: multipart/form-data" -F "file=@path_to_your_audio_file"
2. 使用 requests Python 脚本
import requests

url = "http://localhost:6006/transcribe/"
file_path = "瓦解--史塔克.mp3"  # 替换为你的实际音频文件路径

with open(file_path, "rb") as f:
    files = {"file": f}
    response = requests.post(url, files=files, headers={"accept": "text/event-stream"})

print(response)
for line in response.iter_lines():
    if line:
        print(line.decode("utf-8"))
Jenkins 发测试邮件报错 553 Mail from must equal authorized user 报错信息报错原因解决办法 报错信息
org.eclipse.angus.mail.smtp.SMTPSenderFailedException: 553 Mail from must equal authorized user at org.eclipse.angus.mail.smtp.SMTPTransport.mailFrom(SMTPTra…
近日,由麒麟软件研发的KMRE(Kylin Mobile Runtime Environment,麒麟移动运行环境)在openKylin(开放麒麟)社区正式发布,为Linux桌面操作系统产品提供了高效的Android运行环境解决方案。这也是国内…