This is also a general-purpose recipe: fine-tuning an LLM with peft.
Prepare your own dataset
Adapt this step to your own data as needed. The target format is JSONL with three fields per record: context, answer, question.
import pandas as pd
import random
import json

data = pd.read_csv('dataset.csv')
train_data = data[['prompt', 'Code']]
train_data = train_data.values.tolist()
random.shuffle(train_data)
train_num = int(0.8 * len(train_data))

# first 80% for training
with open('train_data.jsonl', 'w') as f:
    for d in train_data[:train_num]:
        d = {'context': '', 'question': d[0], 'answer': d[1]}
        f.write(json.dumps(d) + '\n')

# remaining 20% for validation
with open('val_data.jsonl', 'w') as f:
    for d in train_data[train_num:]:
        d = {'context': '', 'question': d[0], 'answer': d[1]}
        f.write(json.dumps(d) + '\n')
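Each line of the resulting JSONL files is one JSON object. As a quick sanity check (this snippet is my addition, not part of the original script), you can read one record back and confirm the three expected fields:

import json

with open('train_data.jsonl') as f:
    sample = json.loads(f.readline())
print(sample.keys())  # expect: dict_keys(['context', 'question', 'answer'])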
Initialization
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset

# load our own dataset
train_dataset = load_dataset('json', data_files='train_data.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='val_data.jsonl', split='train')

# load the base model in 8-bit
base_model = 'CodeLlama-7b-Instruct-hf'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)
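Optionally (my addition, not in the original), you can confirm the 8-bit load fits in GPU memory; transformers models expose get_memory_footprint():

# rough model memory footprint after 8-bit loading
print(f"model memory: {model.get_memory_footprint() / 1e9:.1f} GB")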
Performance before fine-tuning
tokenizer.pad_token = tokenizer.eos_token

prompt = """You are a programming coder.
Now answer the question:
{}"""
prompts = [prompt.format(train_dataset[i]['question']) for i in [1, 20, 32, 45, 67]]
model_input = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=300)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(outputs)
Fine-tuning
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )
    # "self-supervised learning": the labels are a copy of the inputs
    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""You are a powerful programming model. Your job is to answer questions about a database. You are given a question.
You must output the code that answers the question.

### Input:
{data_point["question"]}

### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

model.train()  # put the model back into training mode
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# keep Trainer from setting up its own DataParallel when more than one GPU is available
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

batch_size = 128
per_device_train_batch_size = 32
# effective batch size = per_device_train_batch_size * gradient_accumulation_steps
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "code-llama-ft"

training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=400,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",  # "no" if there is no validation set
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    load_best_model_at_end=False,
    group_by_length=True,  # group sequences of roughly the same length to speed up training
    report_to="none",  # set to "wandb" to log to Weights & Biases
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
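Before launching training, it is worth confirming how small the LoRA update actually is; peft models provide print_trainable_parameters() for this (the call below is my addition):

# prints the trainable-vs-total parameter counts; with r=16 on the four attention
# projections this is well under 1% of the 7B model
model.print_trainable_parameters()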
Start training
model.config.use_cache = False

# patch state_dict so that checkpoints contain only the LoRA weights
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
).__get__(model, type(model))

if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)

trainer.train()
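Trainer writes its checkpoints into subdirectories of output_dir (checkpoint-20, checkpoint-40, ...). To load the adapter from output_dir directly in the test below, it can help to save it explicitly once training finishes (my addition; alternatively, point PeftModel.from_pretrained at a specific checkpoint directory):

# write adapter_config.json and the adapter weights to output_dir
model.save_pretrained(output_dir)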
Testing
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

base_model = 'CodeLlama-7b-Instruct-hf'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# attach the fine-tuned LoRA adapter
output_dir = "code-llama-ft"
model = PeftModel.from_pretrained(model, output_dir)

eval_prompt = """You are a powerful programming model. Your job is to answer questions about a database. You are given a question.
You must output the code that answers the question.

### Input:
Write a function in Java that takes an array and returns the sum of the numbers in the array, or 0 if the array is empty. Except the number 13 is very unlucky, so it does not count any 13, or any number that immediately follows a 13.

### Response:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=100)[0]
    print(tokenizer.decode(outputs, skip_special_tokens=True))
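If you want a standalone checkpoint that no longer needs peft at inference time, the adapter can be merged into the base weights. A minimal sketch (my addition; note that merging requires loading the base model in fp16 rather than 8-bit):

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# load the base model at full (fp16) precision so the LoRA deltas can be folded in
base = AutoModelForCausalLM.from_pretrained(
    'CodeLlama-7b-Instruct-hf', torch_dtype=torch.float16, device_map="auto"
)
merged = PeftModel.from_pretrained(base, "code-llama-ft").merge_and_unload()
merged.save_pretrained("code-llama-ft-merged")  # a plain transformers checkpoint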
Main reference: https://zhuanlan.zhihu.com/p/660933421