API服务的快速搭建和测试
使用Python的FastAPI迅速搭建一个简单API
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch# 设置CUDA设备信息
# Target accelerator: the device type plus an optional device index.
DEVICE = "cuda"
DEVICE_ID = "0"

# Fully qualified device string; when DEVICE_ID is empty, DEVICE is used as-is.
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE


# Function that clears the CUDA cache
def torch_gc():
    """Release cached CUDA memory after a generation request.

    Does nothing when CUDA is not available. Otherwise, with
    ``CUDA_DEVICE`` selected as the current device, it empties PyTorch's
    CUDA cache and collects memory released through CUDA IPC.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


# Create the FastAPI application
app = FastAPI()


# Define the handler for POST requests
@app.post("/")
async def create_item(request: Request):
    """Generate a chat reply for the prompt carried in the JSON request body.

    The body must be a JSON object. Recognized keys:
        prompt: the user message (required, must be a string).
        history: earlier conversation turns, passed to ``model.chat`` as-is.
        max_length, top_p, temperature: generation settings; a missing or
            falsy value (including 0) falls back to 2048, 0.8 and 0.8.

    Returns:
        A dict with the model's ``response``, the updated ``history``,
        a ``status`` of 200 and the ``time`` at which the reply was built.

    Raises:
        HTTPException: status 400 when the body is not a JSON object or
            ``prompt`` is not a string.
    """
    # Imported here so the handler is self-contained with respect to the
    # file's import block.
    from fastapi import HTTPException

    # request.json() already returns parsed data, so no dumps/loads
    # round-trip is needed.
    payload = await request.json()
    if not isinstance(payload, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")

    prompt = payload.get('prompt')
    if not isinstance(prompt, str):
        # Reject early instead of failing later with a TypeError (HTTP 500).
        raise HTTPException(status_code=400, detail="'prompt' must be a string")

    # `model` and `tokenizer` are module-level globals assigned by the
    # script entry point; they are only read here.
    response, history = model.chat(
        tokenizer,
        prompt,
        history=payload.get('history'),
        max_length=payload.get('max_length') or 2048,
        top_p=payload.get('top_p') or 0.8,
        temperature=payload.get('temperature') or 0.8,
    )

    # Timestamp shared by the response payload and the log line.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": timestamp,
    }

    # Log the exchange; the stray '", ' fragment of the old format is dropped.
    print(f'[{timestamp}] prompt:"{prompt}", response:"{response!r}"')

    # Free cached CUDA memory before returning.
    torch_gc()
    return answer


# Script entry point
if __name__ == '__main__':
    # Load the tokenizer and model once at startup. They are bound at module
    # level because create_item reads them as globals.
    # NOTE: trust_remote_code=True lets transformers run Python code shipped
    # with the checkpoint, so only point this at a model source you trust.
    model_path = "../base_model/chatglm3-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda()
    model.eval()

    # Start the FastAPI application with a single worker.
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
使用Python调用API
import requests# 定义请求URL
# Request URL; the host part is a placeholder for the real service address.
url = "http://实际API服务地址:8000"

# Request headers
headers = {
    "Content-Type": "application/json",
}

# Request body
data = {
    "prompt": "你好",
    "history": [],
}

# Send the POST request. Without a timeout, requests waits indefinitely on an
# unresponsive server. The tuple is (connect, read) in seconds; the read
# limit is generous to leave room for a slow reply.
response = requests.post(url, headers=headers, json=data, timeout=(5, 300))

# Print the response
print(response.text)