ui-tars Deployment and Training
- Overview
- Quick Start
- Environment Setup
- ui-tars Web Inference and Training
- ui-tars API Deployment
- omni-parser Usage
Overview
The image bundles ui-tars, llama-factory, and omni-parser. It is still under review and is expected to go live tomorrow; once published, you can find it by searching the community images on AutoDL.
Quick Start
Use the AutoDL image:
https://www.codewithgpu.com/i/hiyouga/LLaMA-Factory/ui-tars_omni-parser_llama-factory
Environment Setup
Copy the models from the system disk to the data disk; once the copy has succeeded, you can delete the originals to free space.
cp -r /root/model/UI-TARS-7B-DPO /root/autodl-tmp/
cp -r /root/omni /root/autodl-tmp/
ui-tars Web Inference and Training
bash /root/LLaMA-Factory/chuli/one.sh
In the advanced settings, change the prompt template to qwen2_vl; otherwise image upload will not work.
For detailed usage, see the official llama-factory repository:
https://github.com/hiyouga/LLaMA-Factory
ui-tars API Deployment
Enter the conda environment:
conda activate llama
The -tp flag sets the number of GPUs to use; here it is changed to 1.
python -m vllm.entrypoints.openai.api_server --served-model-name ui-tars \
  --model /root/autodl-tmp/UI-TARS-7B-DPO --limit-mm-per-prompt image=5 \
  --dtype=half -tp 1
Use the custom-service port mapping so the API can be called from your local machine:
ssh -CNg -L 8000:127.0.0.1:8000 root@region-9.autodl.pro -p 46525
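Before sending requests, it can help to confirm the tunnel is actually up. A minimal sketch that probes the forwarded port (host and port match the mapping above):

```python
import socket

def port_open(host: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port can be established."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

# with the SSH tunnel running, the forwarded port should be reachable
print(port_open("127.0.0.1", 8000))
```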
Example call from the local machine:
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "ui-tars", "messages": [{"role": "user", "content": "我想问你,5的阶乘是多少?<think>\n"}]}'
{"id":"chat-7c8149f008a24adfa451a989ba6256d5","object":"chat.completion","created":1741314705,"model":"ui-tars",
 "choices":[{"index":0,"message":{"role":"assistant","content":"5的阶乘是120。阶乘运算的数学符号是“!”。在计算机编程语言中,它通常用“ fact”来表示。阶乘的定义为:n! = n * (n - 1) * (n - 2) * ... * 2 * 1,其中n是一个正整数。","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],
 "usage":{"prompt_tokens":22,"total_tokens":97,"completion_tokens":75},"prompt_logprobs":null}
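The response above can be unpacked programmatically. A minimal sketch; the `raw` literal below is an abbreviated copy of the response shown above:

```python
import json

# abbreviated copy of the chat.completion response shown above
raw = ('{"id": "chat-7c8149f008a24adfa451a989ba6256d5", "object": "chat.completion",'
       ' "model": "ui-tars",'
       ' "choices": [{"index": 0, "message": {"role": "assistant",'
       ' "content": "5的阶乘是120。"}, "finish_reason": "stop"}],'
       ' "usage": {"prompt_tokens": 22, "total_tokens": 97, "completion_tokens": 75}}')

resp = json.loads(raw)
answer = resp["choices"][0]["message"]["content"]  # the assistant's reply text
usage = resp["usage"]                              # token accounting
print(answer)
print(f'completion_tokens: {usage["completion_tokens"]}')
```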
test.py
from model import OpenAIModel, print_with_color

configs = {
    "DEEPSEEK_API_BASE": "http://localhost:8000/v1/chat/completions",
    "DEEPSEEK_API_MODEL": "ui-tars",
    "MAX_TOKENS": 1024,
    "TEMPERATURE": 0,
    "OPENAI_API_KEY": ''
}


def ask(question: str):
    print_with_color("####################deepseek####################", "magenta")
    print_with_color(f"question: {question}", 'yellow')
    mllm = OpenAIModel(base_url=configs["DEEPSEEK_API_BASE"],
                       api_key=configs["OPENAI_API_KEY"],
                       model=configs["DEEPSEEK_API_MODEL"],
                       temperature=configs["TEMPERATURE"],
                       max_tokens=configs["MAX_TOKENS"],
                       disable_proxies=True)
    prompt = question
    images = ['image1.jpg']
    status, rsp = mllm.get_model_response(prompt, images=images)
    if not status:
        print_with_color(f"Request failed: {rsp}", 'red')
        return
    print_with_color(f"*********************** rsp:\n{rsp}", "yellow")


ask("Describe the content of this image")
model.py
from abc import abstractmethod
from typing import List, Tuple
import base64
import requests
from colorama import Fore, Style


def encode_image(image_path):
    """Read an image file and return its base64-encoded contents."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def print_with_color(text: str, color=""):
    """Print text in the given terminal color, then reset the style."""
    colors = {
        "red": Fore.RED, "green": Fore.GREEN, "yellow": Fore.YELLOW,
        "blue": Fore.BLUE, "magenta": Fore.MAGENTA, "cyan": Fore.CYAN,
        "white": Fore.WHITE, "black": Fore.BLACK,
    }
    print(colors.get(color, "") + text)
    print(Style.RESET_ALL)


class BaseModel:
    def __init__(self):
        pass

    @abstractmethod
    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        pass


class OpenAIModel(BaseModel):
    def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
                 max_tokens: int, disable_proxies=False):
        super().__init__()
        self.base_url = base_url
        self.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.disable_proxies = disable_proxies

    def get_model_response(self, prompt: str, images: List[str] = None,
                           tools: list = None, history: list = None,
                           role: str = "user") -> Tuple[bool, str]:
        # build OpenAI-style multimodal content: text first, then images as data URLs
        content = [{"type": "text", "text": prompt}]
        for img in images or []:
            base64_img = encode_image(img)
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"}
            })
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": self.model,
            "messages": [{"role": role, "content": content}],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }
        if tools:
            payload["tools"] = tools
        if history:
            history.append(payload['messages'][-1])
            payload['messages'] = history
        # an empty proxies mapping is passed when disable_proxies is set
        proxies = {} if self.disable_proxies else None
        response = requests.post(self.base_url, headers=headers, json=payload,
                                 proxies=proxies).json()
        if "error" in str(response):
            print_with_color(f"Request failed, response: {response}", "red")
            return False, response
        if 'usage' not in response:
            print_with_color(f"no usage field in response: {response}", 'red')
        else:
            usage = response["usage"]
            prompt_tokens = usage["prompt_tokens"]
            total_tokens = usage["total_tokens"]
            completion_tokens = usage["completion_tokens"]
            print_with_color(f"total_tokens: {total_tokens}, "
                             f"prompt_tokens: {prompt_tokens}, "
                             f"completion_tokens: {completion_tokens}")
            if self.model == "gpt-4o":
                cost = prompt_tokens / 1000 * 0.005 + completion_tokens / 1000 * 0.015
                print_with_color(f"Request gpt-4o cost is ${cost:.2f}", "yellow")
            else:
                cost = prompt_tokens / 1000 * 0.01 + completion_tokens / 1000 * 0.03
                print_with_color(f"Request cost is ${cost:.2f}", "yellow")
        if tools:
            return True, response["choices"][0]["message"]["tool_calls"]
        return True, response["choices"][0]["message"]["content"]
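The message content built in get_model_response embeds each image as a base64 data URL. A self-contained sketch of that round trip (the placeholder bytes below are just an illustration, not a real JPEG):

```python
import base64
import os
import tempfile

def encode_image(image_path):
    # same helper as in model.py: read the file and base64-encode it
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# write a few placeholder bytes to a temp file, then encode them
raw = b"\xff\xd8\xff\xe0 fake jpeg payload"
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
    f.write(raw)
    path = f.name

b64 = encode_image(path)
data_url = f"data:image/jpeg;base64,{b64}"
os.unlink(path)

# decoding the base64 portion recovers the original bytes exactly
assert base64.b64decode(b64) == raw
print(data_url[:30])
```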
omni-parser Usage
Method 1: deploy on the server
Enter the omni directory.
Enter the conda environment:
conda activate llama
Start the service (a GPU is strongly recommended):
python server.py
On the local machine:
Call the parser method in client.py.
Method 2: deploy and call locally
If your local machine has a reasonably good GPU, you can call the parser method in omni_parser.py directly.