1. Start the model with ms-swift (a brief launch note follows below)
2. Implement the HTTP call in Python
3. Use concurrency to issue the calls simultaneously
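For step 1, ms-swift provides a `swift deploy` command that serves a model behind an OpenAI-compatible `/v1/chat/completions` endpoint, which is what the script below talks to on port 8001. The exact flags differ between ms-swift releases, so treat something like `swift deploy --model_type qwen2_5-72b-instruct --port 8001` only as a sketch and check the documentation of your installed version. Steps 2 and 3 are handled by the Python script that follows.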
import requests
import json


class Qwen:
    api_key = 'EMPTY'
    base_url = "http://127.0.0.1:8001/v1/chat/completions"
    name = 'qwen2_5-72b-instruct'


class Llama:
    api_key = 'EMPTY'
    base_url = "http://127.0.0.1:8001/v1/chat/completions"
    name = 'llama3-70b-instruct'


def chat_with_gpt(api_key, messages, model="gpt-3.5-turbo", temperature=0.3, stream=False,
                  url="https://api.openai.com/v1/chat/completions"):
    """Call an OpenAI-compatible chat completions API.

    Args:
        api_key (str): API key.
        messages (list): message list in the form [{"role": "user", "content": "your message"}].
        model (str): model name, defaults to "gpt-3.5-turbo".
        temperature (float): sampling temperature controlling randomness, defaults to 0.3.
        stream (bool): whether to stream the output, defaults to False.
        url (str): API endpoint, defaults to "https://api.openai.com/v1/chat/completions".

    Returns:
        If stream=False, the full response as a dict.
        If stream=True, a generator that yields response chunks.
    """
    # Request headers
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    # Request body
    data = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "stream": stream
    }
    print('header', json.dumps(headers, ensure_ascii=False))
    print('data', json.dumps(data, ensure_ascii=False))
    # Send the request
    if stream:
        # Streaming request
        response = requests.post(url, headers=headers, json=data, stream=True)
        return _handle_stream_response(response)
    else:
        # Non-streaming request
        response = requests.post(url, headers=headers, json=data)
        return response.json()


def _handle_stream_response(response):
    """Generator that yields chunks of a streaming response.

    Args:
        response: a requests.Response object.

    Yields:
        Decoded response lines, one at a time.
    """
    for chunk in response.iter_lines():
        if chunk:
            decoded_chunk = chunk.decode("utf-8")
            yield decoded_chunk
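# Added usage sketch: assuming the ms-swift endpoint returns the standard
# OpenAI chat-completions schema (an assumption about the server, not something
# chat_with_gpt itself verifies), the assistant's text for a non-streaming call
# sits at choices[0].message.content and can be pulled out with a helper like this.
def extract_reply(response_json):
    """Return the assistant's text from a non-streaming chat-completions response."""
    return response_json["choices"][0]["message"]["content"]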
message = [{"role": "user", "content": "你是谁"}]
import time
# t1 = time.time()
# r1 = chat_with_gpt(Qwen.api_key, message, model=Qwen.name,url=Qwen.base_url)
# t2 = time.time()
# print(t2-t1)
#
# t1 = time.time()
# r2 = chat_with_gpt(Llama.api_key, message, model=Llama.name,url=Llama.base_url)
# t2 = time.time()
# print(t2-t1)
# print(r2)
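# The commented-out block above is the sequential baseline: each call waits for
# the previous one to finish. The code below wraps the same call in a helper and
# submits many of them through a thread pool so they run concurrently.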
def chat_with_model(model_class, message):
    """Helper for concurrent requests: call one model and time the request.

    Args:
        model_class: model class (e.g. Qwen or Llama).
        message (list): message list in the form [{"role": "user", "content": "your message"}].

    Returns:
        A tuple of (response content, elapsed seconds rounded to 4 decimals).
    """
    t1 = time.time()
    response = chat_with_gpt(model_class.api_key, message, model=model_class.name, url=model_class.base_url)
    t2 = time.time()
    print(f"Time taken for {model_class.name}: {t2 - t1} seconds")
    return response, round(t2 - t1, 4)


import concurrent.futures
# Define the messages: 10 identical prompts, purely to exercise concurrency
messages = [[{"role": "user", "content": "你是谁"}] for _ in range(10)]

# Define the list of models
models = [Llama]
durations = []
# Concurrently send a request for every (model, message) pair
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit one task per (model, message) pair
    futures = {
        executor.submit(chat_with_model, model, message): (model, message)
        for model in models
        for message in messages
    }
    # Collect results as they complete
    for future in concurrent.futures.as_completed(futures):
        model, message = futures[future]
        try:
            response, duration = future.result()
            durations.append(duration)
            print(f"Response from {model.name} with message {message}: {response}")
        except Exception as e:
            print(f"Error occurred with {model.name} and message {message}: {e}")
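# Note (added): ThreadPoolExecutor() with no arguments caps the pool at
# min(32, os.cpu_count() + 4) workers on Python 3.8+. Depending on the machine,
# that may be fewer than the number of submitted requests, in which case some
# requests queue client-side instead of reaching the server together; sizing the
# pool explicitly avoids that, e.g.
# ThreadPoolExecutor(max_workers=len(models) * len(messages)).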
# Per-request durations (seconds) recorded from earlier runs:
# llama
# [1.487, 5.2435, 6.6109, 7.9721, 9.3312, 10.737, 15.1935, 16.5562, 17.9155, 19.2751]
# qwen
# [8.5853, 18.9962, 29.1533, 37.4593, 45.8992, 54.3415, 62.7779, 72.2762, 82.2203, 92.1694]
print(durations)
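# Added summary sketch: when the pool is large enough to start all requests at
# once, each duration includes that request's server-side queueing, so
# max(durations) roughly equals the wall-clock time of the whole batch.
if durations:
    print(f"requests: {len(durations)}, "
          f"mean latency: {sum(durations) / len(durations):.2f}s, "
          f"batch wall time ~= {max(durations):.2f}s")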