# BERT 实现文本分类微调 Demo
import random
from collections import namedtuple

'''
有四种文本需要做分类,请使用 BERT 处理这个分类问题
'''
# A text class: its display name plus the seed sentences that belong to it.
Category = namedtuple('Category', ['name', 'samples'])

# The four target classes. generate_data() draws its samples from these
# seed sentences, and the position of each entry defines the class order.
categories = [
    Category('Weather Forecast', [
        '今天北京晴转多云,气温20-25度。',
        '明天上海有小雨,记得带伞。',
    ]),
    Category('Company Financial Report', [
        '本季度公司净利润增长20%。',
        '年度财务报告显示,成本控制良好。',
    ]),
    Category('Company Audit Materials', [
        '审计发现内部控制存在漏洞。',
        '审计确认财务报表无重大错报。',
    ]),
    Category('Product Marketing Ad', [
        '新口味可乐,清爽上市!',
        '买一送一,仅限今日。',
    ]),
]


def generate_data(num_samples_per_category=50):
    """Generate a simulated labelled dataset.

    Args:
        num_samples_per_category: Number of samples to draw for each
            category. Defaults to 50.

    Returns:
        A list of ``(text, label)`` tuples. Entries are grouped by category
        in the order of ``categories``; each text is picked at random (with
        replacement) from that category's seed sentences and each label is
        the category name.
    """
    return [
        (random.choice(category.samples), category.name)
        for category in categories
        for _ in range(num_samples_per_category)
    ]
# Build the datasets: 100 samples per category for training and
# 6 samples per category for testing.
train_data = generate_data(100)
test_data = generate_data(6)

# Illustrative excerpt of train_data (a list of (text, label) tuples):
#
# train_data =
# [('明天上海有小雨,记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
#  ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
#  ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
#  ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
#  ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
#  ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
# ]

from transformers import BertTokenizer, BertForSequenceClassification

try:
    from transformers import AdamW
except ImportError:
    # transformers deprecated its own AdamW and recent releases no longer
    # export it; PyTorch's implementation is the documented replacement.
    # NOTE: torch.optim.AdamW defaults to weight_decay=0.01, whereas the
    # transformers version defaulted to 0.0.
    from torch.optim import AdamW
from torch. utils. data import DataLoader, TensorDataset
import torch
import torch. nn. functional as F
# Map each category name to an integer class id (its index in `categories`).
label_map = {category.name: index for index, category in enumerate(categories)}
num_labels = len(categories)

# The sample texts are Chinese. The English 'bert-base-uncased' vocabulary
# covers only a small set of CJK characters, so many input characters would
# be tokenized as [UNK]; use the Chinese checkpoint instead.
PRETRAINED_MODEL_NAME = 'bert-base-chinese'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# num_labels sizes the classification head to one output per category.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=num_labels
)
def encode_texts(texts, labels):
    """Tokenize texts and convert their category names to class ids.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of category names, parallel to ``texts``. Every
            name must be a key of ``label_map``.

    Returns:
        A ``(encodings, label_ids)`` pair: the tokenizer output (truncated,
        padded, returned as PyTorch tensors) and a tensor holding the class
        id of each label.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    # Callers may pass tuples (e.g. from zip(*data)); hand the tokenizer a list.
    encodings = tokenizer(list(texts), truncation=True, padding=True, return_tensors='pt')
    label_ids = torch.tensor([label_map[label] for label in labels])
    return encodings, label_ids


def prepare_data(data, batch_size=8, shuffle=True):
    """Turn ``(text, label)`` pairs into a DataLoader of tensor batches.

    Args:
        data: Iterable of ``(text, label)`` tuples.
        batch_size: Number of samples per batch. Defaults to 8.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to True.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, label_ids)``
        batches.

    Raises:
        ValueError: If ``data`` is empty.
    """
    data = list(data)
    if not data:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError('prepare_data() requires at least one (text, label) pair')
    texts, labels = zip(*data)
    encodings, label_ids = encode_texts(texts, labels)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


train_loader = prepare_data(train_data)
# Accuracy does not depend on sample order, so the test set is not shuffled.
test_loader = prepare_data(test_data, shuffle=False)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


def train_epoch(model, data_loader, optimizer):
    """Run one training pass over ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        The mean loss over the batches, or 0.0 if the loader produced no
        batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        num_batches += 1
    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0


def evaluate(model, data_loader):
    """Compute classification accuracy over ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.

    Returns:
        The fraction of correctly classified samples (0.0 to 1.0), or 0.0
        if the loader produced no samples.
    """
    model.eval()
    correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            # One predicted class id per sample (argmax across the class axis).
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total_count += labels.size(0)
    # Guard against an empty loader instead of dividing by zero.
    return correct / total_count if total_count else 0.0
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tune for three epochs, reporting the mean training loss and the
# test-set accuracy after each one.
for epoch in range(3):
    train_loss = train_epoch(model, train_loader, optimizer)
    acc = evaluate(model, test_loader)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss}, Test Accuracy: {acc * 100:.2f}%')
def predict(text):
    """Predict the category name for one or more texts.

    Args:
        text: A single string, or a sequence of strings.

    Returns:
        The predicted category name (``str``) when exactly one text is
        given, either as a string or as a one-element sequence. For several
        texts, a list with one category name per text, in input order.

    Raises:
        ValueError: If an empty sequence is passed.
    """
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        raise ValueError('predict() requires at least one text')

    # Do not rely on an earlier evaluate() call having left the model in
    # eval mode; inference must run with dropout disabled.
    model.eval()
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # argmax across the class axis gives one class id per text; an argmax
    # over the flattened logits is only correct for a batch of one.
    predicted_class_ids = torch.argmax(outputs.logits, dim=-1).tolist()
    names = [categories[class_id].name for class_id in predicted_class_ids]
    return names[0] if len(names) == 1 else names
# Classify one unseen sentence with the fine-tuned model and report the result.
new_text = ["明天的天气怎么样?"]
predicted_category = predict(new_text)
print('The predicted category for the new text is: {} '.format(predicted_category))