CodeGeex2
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch
import argparse
try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False

LANGUAGE_TAG = {
    "Abap": "* language: Abap", "ActionScript": "// language: ActionScript", "Ada": "-- language: Ada",
    "Agda": "-- language: Agda", "ANTLR": "// language: ANTLR", "AppleScript": "-- language: AppleScript",
    "Assembly": "; language: Assembly", "Augeas": "// language: Augeas", "AWK": "// language: AWK",
    "Basic": "' language: Basic", "C": "// language: C", "C#": "// language: C#",
    "C++": "// language: C++", "CMake": "# language: CMake", "Cobol": "// language: Cobol",
    "CSS": "/* language: CSS */", "CUDA": "// language: Cuda", "Dart": "// language: Dart",
    "Delphi": "{language: Delphi}", "Dockerfile": "# language: Dockerfile", "Elixir": "# language: Elixir",
    "Erlang": "% language: Erlang", "Excel": "' language: Excel", "F#": "// language: F#",
    "Fortran": "!language: Fortran", "GDScript": "# language: GDScript", "GLSL": "// language: GLSL",
    "Go": "// language: Go", "Groovy": "// language: Groovy", "Haskell": "-- language: Haskell",
    "HTML": "<!--language: HTML-->", "Isabelle": "(*language: Isabelle*)", "Java": "// language: Java",
    "JavaScript": "// language: JavaScript", "Julia": "# language: Julia", "Kotlin": "// language: Kotlin",
    "Lean": "-- language: Lean", "Lisp": "; language: Lisp", "Lua": "// language: Lua",
    "Markdown": "<!--language: Markdown-->", "Matlab": "% language: Matlab", "Objective-C": "// language: Objective-C",
    "Objective-C++": "// language: Objective-C++", "Pascal": "// language: Pascal", "Perl": "# language: Perl",
    "PHP": "// language: PHP", "PowerShell": "# language: PowerShell", "Prolog": "% language: Prolog",
    "Python": "# language: Python", "R": "# language: R", "Racket": "; language: Racket",
    "RMarkdown": "# language: RMarkdown", "Ruby": "# language: Ruby", "Rust": "// language: Rust",
    "Scala": "// language: Scala", "Scheme": "; language: Scheme", "Shell": "# language: Shell",
    "Solidity": "// language: Solidity", "SPARQL": "# language: SPARQL", "SQL": "-- language: SQL",
    "Swift": "// language: swift", "TeX": "% language: TeX", "Thrift": "/* language: Thrift */",
    "TypeScript": "// language: TypeScript", "Vue": "<!--language: Vue-->", "Verilog": "// language: Verilog",
    "Visual Basic": "' language: Visual Basic",
}

app = FastAPI()


def device(config, model_path):
    if enable_chatglm_cpp and config.use_chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16" if config.half else "f32"
        if config.quantize in [4, 5, 8]:
            dtype = f"q{config.quantize}_0"
        model = chatglm_cpp.Pipeline(model_path, dtype=dtype)
        return model
    print("chatglm-cpp not enabled, falling back to transformers")
    if config.device != "cpu":
        if not config.half:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device))
        else:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device)).half()
        if config.quantize in [4, 8]:
            print(f"Model is quantized to INT{config.quantize} format.")
            model = model.half().quantize(config.quantize)
    else:
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
    return model.eval()


@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    lang = json_post_list.get('lang')
    prompt = json_post_list.get('prompt')
    max_length = json_post_list.get('max_length', 128)
    top_p = json_post_list.get('top_p', 0.95)
    temperature = json_post_list.get('temperature', 0.2)
    top_k = json_post_list.get('top_k', 0)
    if lang != "None":
        prompt = LANGUAGE_TAG[lang] + "\n" + prompt
    if enable_chatglm_cpp and use_chatglm_cpp:
        response = model.generate(prompt, max_length=max_length, do_sample=temperature > 0,
                                  top_p=top_p, top_k=top_k, temperature=temperature)
    else:
        response = model.chat(tokenizer, prompt, max_length=max_length, top_p=top_p,
                              top_k=top_k, temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {"response": response, "lang": lang, "status": 200, "time": time}
    return answer


def api_start(config):
    global use_chatglm_cpp
    use_chatglm_cpp = config.use_chatglm_cpp
    model_path = "CodeModels/CodeGeex2"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7861, workers=1)
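api_start() expects a config object carrying the attributes that device() reads: device, half, quantize and use_chatglm_cpp. The original launcher is not shown above, so the argparse snippet below is only an assumed way to supply those fields; the flag names and defaults are illustrative, not part of the original script.

# Hypothetical launcher: builds the namespace that device()/api_start() expect.
# Flag names and defaults are assumptions for illustration.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Serve CodeGeex2 over FastAPI")
    parser.add_argument("--device", type=str, default="0", help='CUDA device index, or "cpu"')
    parser.add_argument("--half", action="store_true", help="load weights in FP16")
    parser.add_argument("--quantize", type=int, default=0, help="INT quantization bits (4/5/8); 0 = off")
    parser.add_argument("--use-chatglm-cpp", dest="use_chatglm_cpp", action="store_true",
                        help="run inference through chatglm-cpp if it is installed")
    api_start(parser.parse_args())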
ChatGLM2_6B
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch


def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


app = FastAPI()


def device(config, model_path):
    if config.device != "cpu":
        if not config.half:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device))
        else:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device)).half()
        if config.quantize in [4, 8]:
            print(f"Model is quantized to INT{config.quantize} format.")
            model = model.half().quantize(config.quantize)
    else:
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
    return model.eval()


@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    history = json_post_list.get('history', [])
    max_length = json_post_list.get('max_length', 2048)
    top_p = json_post_list.get('top_p', 0.7)
    temperature = json_post_list.get('temperature', 0.95)
    top_k = json_post_list.get('top_k', 0)
    response, history = model.chat(tokenizer, prompt, history=history, max_length=max_length,
                                   top_p=top_p, temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {"response": response, "history": history, "status": 200, "time": time}
    torch_gc(model.device)
    return answer


def api_start(config):
    model_path = "LanguageModels/ChatGLM2_6B/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7862, workers=1)
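Once the ChatGLM2_6B service is listening on port 7862, it can be exercised with a plain requests call. The following is a minimal client sketch, not part of the service itself; the prompts are arbitrary, and the returned history is fed back on the second call to keep multi-turn context, which is exactly what the handler expects.

# Minimal client sketch for the ChatGLM2_6B endpoint (assumes it runs locally on port 7862).
import requests

url = "http://127.0.0.1:7862/"
payload = {"prompt": "Hello, who are you?", "history": [], "max_length": 2048, "top_p": 0.7, "temperature": 0.95}
first = requests.post(url, json=payload).json()
print(first["response"])

# Multi-turn: pass the returned history back so the model keeps the conversation context.
payload = {"prompt": "Please elaborate.", "history": first["history"]}
second = requests.post(url, json=payload).json()
print(second["response"])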
Baichuan2_13B
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers. generation. utils import GenerationConfig
import uvicorn, json, datetime
import torch


def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


app = FastAPI()


def device(config, model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",
                                                 torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_path)
    return model.eval()


@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    messages = []
    messages.append({"role": "user", "content": prompt})
    response = model.chat(tokenizer, messages)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {"response": response, "status": 200, "time": time}
    torch_gc(model.device)
    return answer


def api_start(config):
    model_path = "LanguageModels/Baichuan2_13B_Chat/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7863, workers=1)
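The Baichuan2 handler only reads prompt from the request body and wraps it into a single-turn messages list, so a test call is correspondingly simple. A minimal sketch, assuming the service is running locally on port 7863 and the prompt is arbitrary:

# Minimal client sketch for the Baichuan2_13B_Chat endpoint (assumes port 7863 on localhost).
import requests

resp = requests.post("http://127.0.0.1:7863/", json={"prompt": "Introduce yourself in one sentence."})
print(resp.json()["response"])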
sqlcoder
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers. generation. utils import GenerationConfig
import uvicorn, json, datetime
import torch


def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


app = FastAPI()


def device(config, model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", load_in_8bit=True,
                                                 use_cache=True, trust_remote_code=True)
    return model.eval()


@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    eos_token_id = tokenizer.convert_tokens_to_ids(["```"])[0]
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**inputs, num_return_sequences=1, eos_token_id=eos_token_id,
                                   pad_token_id=eos_token_id, max_new_tokens=400, do_sample=False,
                                   num_beams=5)
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    response = outputs[0].split("```sql")[-1].split("```")[0].split(";")[0].strip() + ";"
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {"response": response, "status": 200, "time": time}
    torch_gc(model.device)
    return answer


def api_start(config):
    model_path = "CodeModels/sqlcoder/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7864, workers=1)
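The sqlcoder handler stops generation at the ``` token and keeps only what follows ```sql in the decoded output, so the prompt should contain the question plus the table schema and end with an opening ```sql fence. The sketch below shows one such call; the schema and question are invented purely for illustration and the prompt layout is only an assumed sqlcoder-style template.

# Minimal client sketch for the sqlcoder endpoint (assumes port 7864 on localhost).
# The schema and question below are made up for illustration.
import requests

prompt = """### Task
Generate a SQL query to answer the following question: How many orders were placed in 2023?

### Database Schema
CREATE TABLE orders (
    id INTEGER PRIMARY KEY,
    created_at DATE,
    amount DECIMAL(10, 2)
);

### SQL
```sql
"""
resp = requests.post("http://127.0.0.1:7864/", json={"prompt": prompt})
print(resp.json()["response"])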
Testing after startup
curl -X POST "http://127.0.0.1:7864" -H 'Content-Type: application/json' -d '{"prompt": "你的名字是"}'
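The other services can be smoke-tested the same way. For example, the CodeGeex2 endpoint on port 7861 additionally accepts a lang field that selects the language tag prepended to the prompt; the prompt below is only an example:

curl -X POST "http://127.0.0.1:7861" -H 'Content-Type: application/json' -d '{"lang": "Python", "prompt": "# write a quicksort function"}'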