What it does: speech-to-text, text-to-speech, and an AI voice conversation, all built on Azure Speech and Azure OpenAI.
Replace the keys in the code with your own and it will run as-is.
How to find the keys:
【保姆级教程】如何在azure里快速找到openai的key和demo-CSDN博客
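Rather than pasting the keys into every file, you can export them once and read them at startup. This is a minimal sketch, assuming the openai and azure-cognitiveservices-speech packages are installed; AZURE_OPENAI_API_KEY is the variable that azure_openai_client.py (shown below) already falls back to, while AZURE_SPEECH_KEY and AZURE_SPEECH_REGION are hypothetical names you would read yourself before constructing SpeechService.

```python
import os

from azure_openai_client import AzureOpenAIClient
from speech_utils import SpeechService

# AZURE_OPENAI_API_KEY is read by AzureOpenAIClient as a fallback when no key is passed in.
ai_client = AzureOpenAIClient()

# AZURE_SPEECH_KEY / AZURE_SPEECH_REGION are hypothetical variable names, used here for illustration only.
speech_service = SpeechService(
    speech_key=os.environ["AZURE_SPEECH_KEY"],
    service_region=os.environ.get("AZURE_SPEECH_REGION", "eastus"),
)
```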
Project structure:
azure_openai_client.py
main.py
prompts_config.py
speech_utils.py
stt01.py
tts01.py
azure_openai_client.py
```python
import os
import base64
from openai import AzureOpenAI
from typing import List, Dict, Optional, Union


class AzureOpenAIClient:
    def __init__(self,
                 endpoint: str = "replace with your endpoint",
                 deployment: str = "your deployment name",
                 api_key: Optional[str] = None,
                 system_prompt: str = None):
        """Initialize the Azure OpenAI client.

        Args:
            endpoint: Azure OpenAI service endpoint
            deployment: deployment name
            api_key: API key; if None, it is read from the environment variable
            system_prompt: system prompt; if None, a default prompt is used
        """
        self.endpoint = endpoint
        self.deployment = deployment
        self.api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY", "replace with your key")

        self.client = AzureOpenAI(
            azure_endpoint=self.endpoint,
            api_key=self.api_key,
            api_version="2024-05-01-preview"
        )

        # Use the supplied system prompt, or fall back to the default
        default_prompt_text = "你是一个帮助用户查找信息的 AI 助手。"
        if system_prompt:
            default_prompt_text = system_prompt

        self.default_chat_prompt = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": default_prompt_text
                    }
                ]
            }
        ]

    def encode_image(self, image_path: str) -> str:
        """Encode an image file as a base64 string.

        Args:
            image_path: path to the image

        Returns:
            base64-encoded image string
        """
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('ascii')

    def chat_completion(self,
                        messages: Optional[List[Dict]] = None,
                        max_tokens: int = 200,
                        temperature: float = 0.7,
                        top_p: float = 0.95,
                        frequency_penalty: float = 0,
                        presence_penalty: float = 0,
                        stop: Optional[Union[str, List[str]]] = None,
                        stream: bool = False):
        """Create a chat completion.

        Args:
            messages: list of chat messages; if None, the default prompt is used
            max_tokens: maximum number of tokens to generate
            temperature: sampling temperature
            top_p: nucleus sampling probability
            frequency_penalty: frequency penalty
            presence_penalty: presence penalty
            stop: stop sequence(s)
            stream: whether to stream the response

        Returns:
            the chat completion response
        """
        if messages is None:
            messages = self.default_chat_prompt

        completion = self.client.chat.completions.create(
            model=self.deployment,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            stream=stream
        )
        return completion
```
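Before wiring in speech, the client can be smoke-tested on its own. A minimal sketch, assuming the endpoint and deployment placeholders above have been replaced with your own values (or passed in explicitly) and that a key is available via AZURE_OPENAI_API_KEY:

```python
from azure_openai_client import AzureOpenAIClient

# Assumes the endpoint/deployment defaults in AzureOpenAIClient have been filled in.
client = AzureOpenAIClient()

messages = [
    {"role": "system", "content": [{"type": "text", "text": "你是一个帮助用户查找信息的 AI 助手。"}]},
    {"role": "user", "content": [{"type": "text", "text": "用一句话介绍 Azure OpenAI。"}]},
]

response = client.chat_completion(messages=messages, max_tokens=100)
print(response.choices[0].message.content)
```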
main.py
```python
from speech_utils import SpeechService, text_to_speech, speech_to_text
import time
import os
from azure_openai_client import AzureOpenAIClient
from prompts_config import get_system_prompt


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="replace with your key",
        service_region="your resource region"
    )

    while True:
        print("\n=== 功能菜单 ===")
        print("1. 语音转文字")
        print("2. 文字转语音")
        print("3. AI 语音对话")
        print("0. 退出")

        choice = input("请选择功能 (0-3): ")

        if choice == "0":
            print("感谢使用,再见!")
            break

        elif choice == "1":
            print("\n=== 语音转文字 ===")
            print("支持的语言:中文、英语、日语")
            print("请说话...")

            # Record the start time
            start_time = time.time()
            success, result = speech_service.speech_to_text(languages=["zh-CN", "en-US", "ja-JP"])
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"\n语音识别耗时: {elapsed_time:.2f}秒")

            if success:
                print(f"识别结果: {result['text']}")
                if result['detected_language']:
                    print(f"检测到的语言: {result['detected_language']}")

                if input("\n是否要将识别的文字转换为语音?(y/n): ").lower() == 'y':
                    # Record the text-to-speech start time
                    tts_start_time = time.time()
                    success, message = speech_service.text_to_speech(result['text'])
                    # Compute and show the text-to-speech elapsed time
                    tts_elapsed_time = time.time() - tts_start_time
                    print(f"文字转语音耗时: {tts_elapsed_time:.2f}秒")
                    print(message)
            else:
                print(f"错误: {result}")

        elif choice == "2":
            print("\n=== 文字转语音 ===")
            print("可选择的语音:")
            print("1. 中文女声 (zh-CN-XiaoxiaoNeural)")
            print("2. 中文男声 (zh-CN-YunxiNeural)")
            print("3. 英文女声 (en-US-AriaNeural)")

            voice_choice = input("请选择语音 (1-3,默认1): ").strip()
            voice_map = {
                "1": "zh-CN-XiaoxiaoNeural",
                "2": "zh-CN-YunxiNeural",
                "3": "en-US-AriaNeural"
            }
            voice_name = voice_map.get(voice_choice, "zh-CN-XiaoxiaoNeural")

            text = input("\n请输入要转换为语音的文字: ")

            # Record the start time
            start_time = time.time()
            success, message = speech_service.text_to_speech(text, voice_name=voice_name)
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"文字转语音耗时: {elapsed_time:.2f}秒")
            print(message)

        elif choice == "3":
            voice_chat()

        else:
            print("\n无效的选择,请重试。")
            time.sleep(1)


def voice_chat():
    # Initialize the speech service
    speech_service = SpeechService(
        speech_key=".....",
        service_region="your resource region, e.g. eastus"
    )

    # Choose the conversation language
    print("\n请选择对话语言:")
    print("1. 中文")
    print("2. English")
    lang_choice = input("请选择 (1/2): ")
    language = "zh-CN" if lang_choice == "1" else "en-US"

    # Get the system prompt configured for this language
    system_prompt = get_system_prompt(language)

    # Create the AI client with that system prompt
    ai_client = AzureOpenAIClient(system_prompt=system_prompt)

    # Seed the conversation history with the system prompt
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        }
    ]

    print("\n=== AI 语音对话开始 ===")
    print("输入 's' 开始对话,输入 'q' 结束对话")

    while True:
        command = input("\n请输入命令 (s: 开始对话, q: 退出): ")

        if command.lower() == 'q':
            break
        elif command.lower() == 's':
            print("\n开始对话模式...")
            print("系统会在AI回复完成后才开始检测您的语音")
            print("说 '再见' 或 'goodbye' 结束对话")

            continue_dialog = True
            while continue_dialog:
                # Prompt the user to speak
                print("\n请开始说话...")

                # Run a single recognition pass per turn
                success, result = speech_service.speech_to_text(languages=[language])

                if success and result['text']:
                    user_text = result['text']
                    print(f"\n您说: {user_text}")

                    # Check whether the user wants to end the conversation
                    if (language == "zh-CN" and "再见" in user_text.lower()) or \
                       (language == "en-US" and "goodbye" in user_text.lower()):
                        print("对话结束")
                        continue_dialog = False
                        break

                    # Append the user message to the history
                    messages.append({
                        "role": "user",
                        "content": [{"type": "text", "text": user_text}]
                    })

                    # Get the AI response
                    print("AI思考中...")
                    response = ai_client.chat_completion(messages=messages)
                    ai_text = response.choices[0].message.content
                    print(f"AI 响应: {ai_text}")

                    # Append the AI response to the history
                    messages.append({
                        "role": "assistant",
                        "content": [{"type": "text", "text": ai_text}]
                    })

                    # Text-to-speech: wait until synthesis has finished
                    print("正在生成语音...")
                    voice_name = "zh-CN-XiaoxiaoNeural" if language == "zh-CN" else "en-US-AriaNeural"
                    success, message = speech_service.text_to_speech(ai_text, voice_name=voice_name)
                    if not success:
                        print(f"语音合成失败: {message}")

                    print("AI语音播放完成,准备下一轮对话")
                else:
                    print("未能识别您的语音,请重试")
        else:
            print("无效的命令,请重试")


if __name__ == "__main__":
    main()
```
prompts_config.py
```python
# System prompt configuration

# Main system prompt - written in a single language (Chinese)
MAIN_SYSTEM_PROMPT = """
你是一个智能AI助手,专注于提供有用、准确的信息。请遵循以下准则:
1. 保持回答简洁明了,避免冗长解释
2. 使用礼貌友好的语气
3. 如果不确定答案,坦诚表示不知道
4. 避免有害或不适当的内容
5. 提供准确、最新的信息
6. 尊重用户隐私,不要要求个人信息
7. 只能输出自然语言,禁止输出md格式的内容。
"""

# Language-specific supplementary prompts
LANGUAGE_PROMPTS = {
    "zh-CN": "请用中文简短回答。",
    "en-US": "Please respond in English concisely.",
    "ja-JP": "簡潔に日本語で回答してください。",
    # More languages can be added here
}


def get_system_prompt(language_code="zh-CN"):
    """Return the full system prompt for the given language."""
    language_prompt = LANGUAGE_PROMPTS.get(language_code, LANGUAGE_PROMPTS["zh-CN"])
    return f"{MAIN_SYSTEM_PROMPT}\n{language_prompt}"
```
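To see what get_system_prompt actually returns for each supported language, a small sketch you can run directly:

```python
from prompts_config import get_system_prompt

# The main prompt stays in Chinese; only the trailing language hint changes.
for lang in ("zh-CN", "en-US", "ja-JP"):
    print(f"--- {lang} ---")
    print(get_system_prompt(lang))
```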
speech_utils.py
```python
import azure.cognitiveservices.speech as speechsdk
import time


class SpeechService:
    def __init__(self, speech_key, service_region):
        self.speech_key = speech_key
        self.service_region = service_region
        self.speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    def text_to_speech(self, text, voice_name="zh-CN-XiaoxiaoNeural"):
        """Convert text to speech.

        :param text: the text to convert
        :param voice_name: voice name, defaults to the Chinese female voice
        :return: (success flag, message)
        """
        try:
            self.speech_config.speech_synthesis_voice_name = voice_name
            speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)

            # Flag used to track when synthesis has finished
            synthesis_completed = False

            def synthesis_completed_cb(evt):
                nonlocal synthesis_completed
                synthesis_completed = True

            # Register the completion event
            speech_synthesizer.synthesis_completed.connect(synthesis_completed_cb)

            result = speech_synthesizer.speak_text_async(text).get()

            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                # Wait for the synthesis-completed event
                while not synthesis_completed:
                    time.sleep(0.1)
                return True, "语音合成成功"
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                return False, f"语音合成取消: {cancellation_details.reason}"
            return False, f"语音合成失败: {result.reason}"
        except Exception as e:
            return False, f"发生错误: {str(e)}"

    def speech_to_text(self, languages=None, continuous=False):
        """Speech-to-text.

        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param continuous: whether to use continuous recognition
        :return: (success flag, result dict or error message)
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages)
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config)
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            if continuous:
                # Continuous recognition mode
                done = False
                recognized_texts = []
                detected_language = None

                def handle_result(evt):
                    nonlocal detected_language
                    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                        recognized_texts.append(evt.result.text)
                        detected_language = evt.result.properties.get(
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult)

                def stop_cb(evt):
                    nonlocal done
                    done = True

                # Wire up the events
                speech_recognizer.recognized.connect(handle_result)
                speech_recognizer.session_stopped.connect(stop_cb)
                speech_recognizer.canceled.connect(stop_cb)

                # Start continuous recognition and wait until the session stops
                speech_recognizer.start_continuous_recognition()
                while not done:
                    time.sleep(0.5)
                speech_recognizer.stop_continuous_recognition()

                return True, {"text": " ".join(recognized_texts), "detected_language": detected_language}
            else:
                # Single-shot recognition mode
                result = speech_recognizer.recognize_once()

                if result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    detected_language = None
                    if hasattr(result, 'properties') and result.properties.get(
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult):
                        detected_language = result.properties[
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult]
                    return True, {
                        "text": result.text,
                        "detected_language": detected_language
                    }
                elif result.reason == speechsdk.ResultReason.NoMatch:
                    return False, f"无法识别语音: {result.no_match_details}"
                elif result.reason == speechsdk.ResultReason.Canceled:
                    return False, f"语音识别取消: {result.cancellation_details.reason}"
                return False, f"未知的识别结果: {result.reason}"
        except Exception as e:
            return False, f"发生错误: {str(e)}"

    def start_continuous_recognition(self, languages=None, callback=None):
        """Start continuous speech recognition.

        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param callback: callback invoked with each recognized result
        :return: the speech_recognizer object, for later control
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages)
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config)
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            # Event handler for recognized speech
            def handle_result(evt):
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text
                    detected_language = evt.result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult)
                    if callback:
                        should_continue = callback(text, detected_language)
                        if should_continue is False:
                            # Stop recognition if the callback returns False
                            speech_recognizer.stop_continuous_recognition_async()

            # Event handler for errors
            def handle_canceled(evt):
                if evt.reason == speechsdk.CancellationReason.Error:
                    print(f"语音识别错误: {evt.error_details}")

            # Wire up the event handlers
            speech_recognizer.recognized.connect(handle_result)
            speech_recognizer.canceled.connect(handle_canceled)

            # Start continuous recognition in the background
            speech_recognizer.start_continuous_recognition_async()

            return speech_recognizer
        except Exception as e:
            print(f"启动连续识别时发生错误: {str(e)}")
            raise


def text_to_speech(text: str, language: str = "zh-CN") -> None:
    """Convert text to speech.

    Args:
        text: the text to convert
        language: language code, defaults to Chinese
    """
    # Replace with your own Speech key and region
    speech_key = "your speech key"
    service_region = "your resource region, e.g. eastus"

    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Pick a suitable voice for the language
    if language == "zh-CN":
        speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
    else:
        speech_config.speech_synthesis_voice_name = "en-US-AriaNeural"

    # Create the speech synthesizer
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    # Run the synthesis
    result = speech_synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("语音合成完成")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"语音合成取消: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"错误详情: {cancellation_details.error_details}")


def speech_to_text(language: str = "zh-CN") -> str:
    """Convert speech to text.

    Args:
        language: language code, defaults to Chinese

    Returns:
        the recognized text, or an empty string on failure
    """
    # Replace with your own Speech key and region
    speech_key = "your speech key"
    service_region = "your resource region"

    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Set the recognition language
    speech_config.speech_recognition_language = language

    # Create the audio configuration (default microphone)
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    # Create the speech recognizer
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("开始说话...")

    # Run a single-shot recognition
    result = speech_recognizer.recognize_once_async().get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print(f"无法识别语音: {result.no_match_details}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"语音识别取消: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"错误详情: {cancellation_details.error_details}")

    return ""
```
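start_continuous_recognition is not exercised by main.py; the sketch below shows one way it could be driven with a callback, assuming your own speech key and region (both values here are placeholders):

```python
import time
from speech_utils import SpeechService

service = SpeechService(speech_key="your speech key", service_region="eastus")

def on_recognized(text, detected_language):
    print(f"识别结果: {text} ({detected_language})")
    # Returning False tells SpeechService to stop the recognizer.
    return "再见" not in text

recognizer = service.start_continuous_recognition(
    languages=["zh-CN", "en-US"],
    callback=on_recognized,
)

# Recognition runs on a background thread; keep the process alive for a while.
time.sleep(30)
recognizer.stop_continuous_recognition_async()
```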
stt01.py
```python
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your speech key",
        service_region="eastus"
    )

    print("请说话...")
    success, result = speech_service.speech_to_text(languages=["zh-CN", "en-US", "ja-JP"])

    if success:
        print(f"识别结果: {result['text']}")
        if result['detected_language']:
            print(f"检测到的语言: {result['detected_language']}")
    else:
        print(f"错误: {result}")


if __name__ == "__main__":
    main()
```
tts01.py
```python
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your speech key",
        service_region="eastus"
    )

    print("请输入要转换为语音的文字...")
    text = input()

    success, message = speech_service.text_to_speech(text)
    print(message)


if __name__ == "__main__":
    main()
```