Speech Recognition Tutorial: Whisper
I. Preface
I've recently needed to watch foreign instructional videos and haven't been entirely comfortable following them. The AI subtitle tools I tried didn't work very well either, so I decided to build my own AI subtitling setup based on Whisper and GPT.
II. Steps
1. Install FFmpeg
Windows:
- Go to https://github.com/BtbN/FFmpeg-Builds/releases, click the entry for the Windows build of FFmpeg, and on the download page click the download button.
- Extract the downloaded zip file to a directory of your choice.
- Add the bin directory of the extracted folder (the one containing ffmpeg.exe) to the PATH environment variable.
- Open a command prompt and run ffmpeg -version; if version information is printed, the installation is complete (you can also verify this from Python, as shown below).
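Whisper calls FFmpeg under the hood, so ffmpeg must be reachable from PATH. As a quick sanity check, here is a minimal sketch using only Python's standard library:

import shutil

# Prints the full path of ffmpeg.exe if it is on PATH, otherwise None
print(shutil.which("ffmpeg"))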
2. Install the Whisper Model
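Whisper itself is installed from PyPI (the package name is openai-whisper) and uses the FFmpeg installed above to decode audio files:

pip install -U openai-whisper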
Run the following program; it will automatically download the Whisper small model, transcribe the audio file audio.mp3, and print the recognized text. (If you don't have unrestricted internet access, download the model file manually.)
import whisper
model = whisper.load_model("small")
result = model.transcribe("audio.mp3")
print(result["text"])
Running it prints the recognized transcript to the console.
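Since the goal here is subtitles, the full result dict is more useful than the plain text: result["segments"] is a list of segments, each with start, end, and text fields. Below is a minimal sketch that writes them out as an .srt file (the output filename and the fmt helper are my own choices, not part of the Whisper API):

def fmt(t):
    # Seconds -> SRT timestamp HH:MM:SS,mmm
    h, rem = divmod(int(t), 3600)
    m, s = divmod(rem, 60)
    return "%02d:%02d:%02d,%03d" % (h, m, s, int((t % 1) * 1000))

with open("audio.srt", "w", encoding="utf-8") as f:
    for i, seg in enumerate(result["segments"], start=1):
        f.write("%d\n%s --> %s\n%s\n\n"
                % (i, fmt(seg["start"]), fmt(seg["end"]), seg["text"].strip()))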
III. Extras
Recording and transcribing audio in real time
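The script below captures the system's loopback ("Stereo Mix") device with PyAudio and transcribes a rolling window of audio with faster-whisper. Both libraries install from PyPI:

pip install faster-whisper pyaudio numpy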
import pyaudio
import wave
import numpy as np
# from audioHandle import addAudio_volume, calculate_volume  # author's local helper; only needed for the commented-out call at the end
from faster_whisper import WhisperModel

model_size = "large-v3"
# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")

def GetIndex():
    # Find the loopback recording device. '立体声混音' is the device name on a
    # Chinese-language Windows system; on an English system look for 'Stereo Mix'.
    p = pyaudio.PyAudio()
    target = '立体声混音'
    for i in range(p.get_device_count()):
        devInfo = p.get_device_info_by_index(i)
        if devInfo['name'].find(target) >= 0 and devInfo['hostApi'] == 0:
            print(devInfo)
            print(devInfo['index'])
            return devInfo['index']
    return -1

# Configuration
FORMAT = pyaudio.paInt16              # sample format
CHANNELS = 1                          # number of channels
RATE = 16000                          # sampling rate (Whisper expects 16 kHz)
CHUNK = 1024                          # frames per buffer
RECORD_SECONDS = 5                    # length of each recording block
WAVE_OUTPUT_FILENAME = "output3.wav"  # output file
DEVICE_INDEX = GetIndex()             # device index; adjust for your system's sound devices

if DEVICE_INDEX == -1:
    print('Please enable the Stereo Mix (立体声混音) device')

audio = pyaudio.PyAudio()

# Start recording
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                    frames_per_buffer=CHUNK, input_device_index=DEVICE_INDEX)
stream.read(CHUNK)  # discard the first chunk to warm up the stream
print("recording...")

frames = []     # everything recorded, for the final WAV file
moreDatas = []  # rolling window of the most recent blocks
maxcount = 3    # keep at most 3 blocks (about 15 s of context)

try:
    while True:
        # Record one block of RECORD_SECONDS seconds
        datas = []
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            audio_data = np.frombuffer(data, dtype=np.int16)
            datas.append(data)
            # Mean absolute amplitude as a rough volume level
            volume = np.mean(np.abs(audio_data))
            print("Volume level:", volume)
        frames.extend(datas)

        # Keep only the last `maxcount` blocks and concatenate them
        moreDatas.append(datas)
        if len(moreDatas) > maxcount:
            moreDatas.pop(0)
        buffers = b''.join(chunk for block in moreDatas for chunk in block)

        print('Starting transcription')
        # faster-whisper expects a float32 waveform in [-1, 1] at 16 kHz,
        # so convert the raw int16 samples before transcribing
        samples = np.frombuffer(buffers, dtype=np.int16).astype(np.float32) / 32768.0
        segments, info = model.transcribe(samples, language="en")
        text = ''
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            text += segment.text
        print(text)
except KeyboardInterrupt:
    pass  # Ctrl+C ends the loop

print("finished recording")
# Stop recording
stream.stop_stream()
stream.close()
audio.terminate()

# Save the recording
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(audio.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
# addAudio_volume(WAVE_OUTPUT_FILENAME)
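Design note: the loop keeps a rolling window of the last maxcount blocks (3 × 5 s ≈ 15 s of audio) and re-transcribes the whole window after every 5-second block, so each pass has enough context to correct the tail of the previous one. Press Ctrl+C to stop; the full recording is then saved to output3.wav.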