from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-ro', use_fast=True)

print(tokenizer)
tokenizer.batch_encode_plus([['Hello, this one sentence!', 'This is another sentence.']])
PreTrainedTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-ro', vocab_size=59543, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})
{'input_ids': [[125, 778, 3, 63, 141, 9191, 23, 187, 32, 716, 9191, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
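As a quick sanity check (a sketch, not part of the original listing), the ids from the output above can be decoded back to text with the same tokenizer:

# Sanity-check sketch: decode the ids shown above back to text.
ids = [125, 778, 3, 63, 141, 9191, 23, 187, 32, 716, 9191, 2, 0]
print(tokenizer.decode(ids))
print(tokenizer.convert_ids_to_tokens(ids))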
from datasets import load_dataset, load_from_disk
dataset = load_dataset(path='wmt16', name='ro-en')
dataset['train'] = dataset['train'].shuffle(1).select(range(20000))
dataset['validation'] = dataset['validation'].shuffle(1).select(range(200))
dataset['test'] = dataset['test'].shuffle(1).select(range(200))
def preprocess_function(data):
    # split each translation pair into source (en) and target (ro) text
    en = [ex['en'] for ex in data['translation']]
    ro = [ex['ro'] for ex in data['translation']]

    # encode the source sentences
    data = tokenizer.batch_encode_plus(en, max_length=128, truncation=True)

    # encode the target sentences and keep only their input_ids as labels
    with tokenizer.as_target_tokenizer():
        data['labels'] = tokenizer.batch_encode_plus(
            ro, max_length=128, truncation=True)['input_ids']

    return data


dataset = dataset.map(function=preprocess_function,
                      batched=True,
                      batch_size=1000,
                      num_proc=4,
                      remove_columns=['translation'])

print(dataset['train'][0])

dataset
{'input_ids': [460, 354, 3794, 12, 10677, 20, 5046, 14, 4, 2546, 37, 8, 397, 5551, 30, 10113, 37, 3501, 19814, 18, 8465, 20, 4, 44690, 782, 2, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [902, 576, 2946, 76, 10815, 17, 5098, 14997, 5, 559, 1140, 43, 2434, 6624, 27, 50, 337, 19216, 46, 22174, 17, 2317, 121, 16825, 2, 0]}
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})
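The load_from_disk import is only needed if the tokenized dataset is cached to disk so the map step does not have to be repeated on every run. A minimal sketch (the path is an arbitrary example, not from the original code):

# Optional caching sketch; the path is an arbitrary example.
dataset.save_to_disk('./wmt16-ro-en-tokenized')

# On a later run, reload instead of calling load_dataset + map again:
dataset = load_from_disk('./wmt16-ro-en-tokenized')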
import torch
def collate_fn(data):
    # pad labels in the batch to the same length with -100 (ignored by the loss)
    max_length = max([len(i['labels']) for i in data])
    for i in data:
        pads = [-100] * (max_length - len(i['labels']))
        i['labels'] = i['labels'] + pads

    # pad input_ids/attention_mask and convert everything to tensors
    data = tokenizer.pad(
        encoded_inputs=data,
        padding=True,
        max_length=None,
        pad_to_multiple_of=None,
        return_tensors='pt',
    )

    # decoder_input_ids = labels shifted one position to the right,
    # with the pad token as the decoder start token
    data['decoder_input_ids'] = torch.full_like(data['labels'],
                                                tokenizer.get_vocab()['<pad>'],
                                                dtype=torch.long)
    data['decoder_input_ids'][:, 1:] = data['labels'][:, :-1]
    data['decoder_input_ids'][data['decoder_input_ids'] == -100] = tokenizer.get_vocab()['<pad>']

    return data


data = [{
    'input_ids': [21603, 10, 37, 3719, 13],
    'attention_mask': [1, 1, 1, 1, 1],
    'labels': [10455, 120, 80]
}, {
    'input_ids': [21603, 10, 7086, 8408, 563],
    'attention_mask': [1, 1, 1, 1, 1],
    'labels': [301, 53, 4074, 1669]
}]

collate_fn(data)['decoder_input_ids']
tensor([[59542, 10455, 120, 80], [59542, 301, 53, 4074]])
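The tensor above shows that decoder_input_ids is just labels shifted one position to the right, with the pad id (59542 for this checkpoint) used as the decoder start token and in place of the -100 loss-masking values. A small check makes the relationship explicit (a sketch reusing the two toy samples and collate_fn defined above):

# Sketch: verify that decoder_input_ids equals labels shifted right by one.
batch = collate_fn(data)
pad_id = tokenizer.get_vocab()['<pad>']
labels = batch['labels'].clone()
labels[labels == -100] = pad_id              # undo the -100 loss masking
shifted = torch.full_like(labels, pad_id)    # pad id as decoder start token
shifted[:, 1:] = labels[:, :-1]
print(torch.equal(shifted, batch['decoder_input_ids']))   # expected: True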
import torch
loader = torch.utils.data.DataLoader(dataset=dataset['train'],
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, data in enumerate(loader):
    break

for k, v in data.items():
    print(k, v.shape, v[:2])

len(loader)
from transformers import AutoModelForSeq2SeqLM, MarianModel
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-ro')

        self.register_buffer('final_logits_bias',
                             torch.zeros(1, tokenizer.vocab_size))

        self.fc = torch.nn.Linear(512, tokenizer.vocab_size, bias=False)

        # copy the lm_head weights from the full seq2seq checkpoint into fc
        parameters = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-ro')
        self.fc.load_state_dict(parameters.lm_head.state_dict())

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels, decoder_input_ids):
        logits = self.pretrained(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 decoder_input_ids=decoder_input_ids)
        logits = logits.last_hidden_state
        logits = self.fc(logits) + self.final_logits_bias

        loss = self.criterion(logits.flatten(end_dim=1), labels.flatten())

        return {'loss': loss, 'logits': logits}


model = Model()
print(sum(i.numel() for i in model.parameters()) / 10000)
from datasets import load_metric
metric = load_metric(path='sacrebleu')
metric.compute(predictions=['hello there', 'general kenobi'],
               references=[['hello there'], ['general kenobi']])
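In newer versions of datasets, load_metric is deprecated and eventually removed; the same metric is available through the separate evaluate package. A sketch of the drop-in replacement (assumes evaluate and sacrebleu are installed):

# Alternative if datasets.load_metric is unavailable:
import evaluate

metric = evaluate.load('sacrebleu')
metric.compute(predictions=['hello there', 'general kenobi'],
               references=[['hello there'], ['general kenobi']])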
Test
def test():
    model.eval()

    loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],
                                              batch_size=8,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    predictions = []
    references = []
    for i, data in enumerate(loader_test):
        with torch.no_grad():
            out = model(**data)

        pred = tokenizer.batch_decode(out['logits'].argmax(dim=2))
        label = tokenizer.batch_decode(data['decoder_input_ids'])

        predictions.extend(pred)
        references.extend(label)

        if i % 2 == 0:
            print(i)
            input_ids = tokenizer.decode(data['input_ids'][0])
            print('input_ids=', input_ids)
            print('pred=', pred[0])
            print('label=', label[0])

        if i == 10:
            break

    references = [[j] for j in references]

    metric_out = metric.compute(predictions=predictions, references=references)
    print(metric_out)


test()
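Note that this test scores teacher-forced argmax predictions against the shifted labels, which is only a rough proxy; translation quality is usually measured with autoregressive decoding. A minimal sketch using the stock checkpoint's generate() method (independent of the custom Model class above):

# Sketch: autoregressive translation with the stock seq2seq checkpoint.
from transformers import AutoModelForSeq2SeqLM

mt_model = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-ro')

inputs = tokenizer(['Hello, this one sentence!'], return_tensors='pt')
generated = mt_model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))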
from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()

        if i % 50 == 0:
            # token-level accuracy on the current batch
            out = out['logits'].argmax(dim=2)
            correct = (data['decoder_input_ids'] == out).sum().item()
            total = data['decoder_input_ids'].shape[1] * 8
            accuracy = correct / total
            del correct
            del total

            # BLEU on the current batch
            predictions = []
            references = []
            for j in range(8):
                pred = tokenizer.decode(out[j])
                label = tokenizer.decode(data['decoder_input_ids'][j])
                predictions.append(pred)
                references.append([label])
            metric_out = metric.compute(predictions=predictions,
                                        references=references)

            lr = optimizer.state_dict()['param_groups'][0]['lr']

            print(i, loss.item(), accuracy, metric_out, lr)

    torch.save(model, 'models/7.翻译.model')


train()
model = torch.load('models/7.翻译.model')
test()
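Since torch.save(model, ...) pickles the entire module, loading it back requires the Model class definition to be importable. A more portable alternative is to save only the weights (a sketch; the .params path is a hypothetical example):

# Sketch: save/load the weights only instead of pickling the whole module.
torch.save(model.state_dict(), 'models/7.翻译.params')

model = Model()
model.load_state_dict(torch.load('models/7.翻译.params'))
test()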