Fine-tuning BERT for Chinese NER

Check the number and model of available GPUs

import torch

# Check whether CUDA is available
if torch.cuda.is_available():
    print("CUDA is available!")
    # Number of CUDA devices
    device_count = torch.cuda.device_count()
    print(f"Number of CUDA devices: {device_count}")
    # Information about the first GPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Device name: {torch.cuda.get_device_name(device)}")
    # Compute capability of the device
    capability = torch.cuda.get_device_capability(device)
    print(f"Device capability: {capability}")
else:
    print("CUDA is not available.")
CUDA is available!
Number of CUDA devices: 4
Device name: NVIDIA GeForce RTX 2080 Ti
Device capability: (7, 5)
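
The notebook later moves everything onto a single device, so with four cards it can be convenient to address one of them explicitly. A minimal sketch (the index is an arbitrary example, not part of the original notebook):

import torch

# With several cards available you can target a specific one;
# 'cuda:1' below is just an example index -- pick whichever card is free.
device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
print(torch.cuda.get_device_name(device))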

Processing the raw data

Loading the tokenizer

from transformers import AutoTokenizer, AutoModelForTokenClassification, BertTokenizerFast, BertForTokenClassification
from transformers import pipeline

tokenizer = BertTokenizerFast.from_pretrained('models/bert-base-chinese')

Tokenizing with the tokenizer, converting to BIO tags, and filtering to the specified NER categories

def generate_bio_tags(tokenizer, text_json, allowed_type={"name", "organization", "government", "address", "company"}):
    def tokenize_with_location(tokenizer, input_data):
        encoded_input = tokenizer.encode_plus(input_data, return_offsets_mapping=True)
        return list(zip([tokenizer.decode(i) for i in encoded_input.input_ids],
                        encoded_input.offset_mapping))

    def get_bio_tag(labels, token_start, token_end):
        if token_start >= token_end:
            return "O"
        for entity_type, entities in labels.items():
            if entity_type in allowed_type:
                for entity_name, positions in entities.items():
                    for position in positions:
                        start, end = position
                        if token_start >= start and token_end <= end + 1:
                            if token_start == start:
                                return f"B-{entity_type}"
                            else:
                                return f"I-{entity_type}"
        return "O"

    text = text_json["text"]
    labels = text_json["label"]
    # Tokenize with the BERT tokenizer, keeping character offsets
    tokenized_text = tokenize_with_location(tokenizer, text)
    tokens, bio_tags = [], []
    for token, loc in tokenized_text:
        loc_s, loc_e = loc
        bio_tag = get_bio_tag(labels, loc_s, loc_e)
        bio_tags.append(bio_tag)
        tokens.append(token)
    return tokens, bio_tags

# Example JSON input
input_json = {"text": "你们是最棒的!#英雄联盟d学sanchez创作的原声王", "label": {"game": {"英雄联盟": [[8, 11]]}}}
generate_bio_tags(tokenizer, input_json)
(['[CLS]', '你', '们', '是', '最', '棒', '的', '!', '#', '英', '雄', '联', '盟', 'd', '学', 'san', '##che', '##z', '创', '作', '的', '原', '声', '王', '[SEP]'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])
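
All tags in this example come back as "O" because the entity type "game" is not in the default allowed_type set. A quick check (a sketch, not part of the original notebook) that the tagging itself works is to pass a type set that does include it:

# Allow the "game" type so the labelled span is actually tagged.
tokens, tags = generate_bio_tags(tokenizer, input_json, allowed_type={"game"})
print([(tok, tag) for tok, tag in zip(tokens, tags) if tag != "O"])
# expected, roughly: [('英', 'B-game'), ('雄', 'I-game'), ('联', 'I-game'), ('盟', 'I-game')]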

Loading the data

Reading the dataset from file

from tqdm.notebook import tqdm
import json

train_file = 'train.json'
dataset = []
with open(train_file, 'r') as file:
    for line in tqdm(file.readlines()):
        data = json.loads(line.strip())
        tokens, bio_tags = generate_bio_tags(tokenizer, data)
        # keep only samples that contain at least one entity tag
        if len(set(bio_tags)) > 1:
            dataset.append({"text": data["text"], "tokens": tokens, "tags": bio_tags})
dataset[0]
  0%|          | 0/10748 [00:00<?, ?it/s]

{'text': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,',
 'tokens': ['[CLS]','浙','商','银','行','企','业','信','贷','部','叶','老','桂','博','士','则','从','另','一','个','角','度','对','五','道','门','槛','进','行','了','解','读','。','叶','老','桂','认','为',',','对','目','前','国','内','商','业','银','行','而','言',',','[SEP]'],
 'tags': ['O','B-company','I-company','I-company','I-company','O','O','O','O','O','B-name','I-name','I-name','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O']}
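
Before building the PyTorch dataset it can be useful to look at how the entity tags are distributed across the loaded samples. A small sketch (not part of the original notebook):

from collections import Counter

# Count the non-"O" tags across the filtered dataset.
tag_counts = Counter(tag for item in dataset for tag in item["tags"] if tag != "O")
print(tag_counts.most_common())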

Custom Dataset

from itertools import product
from torch.utils.data import Dataset, DataLoader

labels = ["O"] + [f"{i}-{j}" for i, j in product(['B', 'I'], ['name', 'address', 'organization', 'government', 'company'])]
label2id = {k: v for v, k in enumerate(labels)}
id2label = {v: k for v, k in enumerate(labels)}

class BertDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.len = len(dataset)
        self.data = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokens and BIO labels were already prepared by generate_bio_tags
        item = self.data[index]
        # step 2: special tokens ([CLS]/[SEP]) are already included in the token list
        tokenized_sentence = item["tokens"]
        labels = item["tags"]
        # step 3: truncate or pad to max_len
        maxlen = self.max_len
        if len(tokenized_sentence) > maxlen:
            # truncate
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad
            tokenized_sentence = tokenized_sentence + ['[PAD]' for _ in range(maxlen - len(tokenized_sentence))]
            labels = labels + ["O" for _ in range(maxlen - len(labels))]
        # step 4: build the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len
mydata =  BertDataset(dataset, tokenizer, 128)
mydata[100]
{'ids': tensor([ 101,  123, 5101, 4638, 6631, 1920, 7481, 2160,  510,  124,  119, 8137,5101, 4638, 6631, 7770, 2231, 7770, 5023, 1166, 1863, 5277,  772, 1501,6574, 5162, 1277, 1818, 1086, 3187, 2124, 1905,  511, 2945, 1909, 2014,1929, 3717, 2279,  122, 1384, 4685, 1068, 5852, 7218,  782, 1447,  792,5305, 8024,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0]),'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0]),'targets': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0])}

Fine-tuning the BERT model

Defining constants

MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

Splitting into training and test sets

import numpy as np
import random
def split_train_test_valid(dataset, train_size=0.9, test_size=0.1):
    dataset = np.array(dataset)
    total_size = len(dataset)
    # define the split sizes
    train_len = int(total_size * train_size)
    test_len = int(total_size * test_size)
    # shuffle the indices and slice the dataset
    idx = list(range(total_size))
    random.shuffle(idx)
    data_train = dataset[idx[:train_len]]
    data_test = dataset[idx[train_len:train_len + test_len]]
    data_valid = dataset[idx[train_len + test_len:]]  # whatever remains is the validation set
    # note: with the defaults (0.9 + 0.1) the validation split only receives rounding leftovers
    return data_train, data_test, data_valid
data_train, data_test, data_valid = split_train_test_valid(dataset)
print("FULL Dataset: {}".format(len(dataset)))
print("TRAIN Dataset: {}".format(data_train.shape))
print("TEST Dataset: {}".format(data_test.shape))
training_set = BertDataset(data_train, tokenizer, MAX_LEN)
testing_set = BertDataset(data_test, tokenizer, MAX_LEN)
FULL Dataset: 7824
TRAIN Dataset: (7041,)
TEST Dataset: (782,)
training_set[0]
{'ids': tensor([ 101, 1925, 6121, 1184, 3667, 3198, 7313, 1139, 1378, 4638, 2791, 6587,3173, 3124, 2190,  702,  782,  857, 2791, 6587, 3621, 3300, 3209, 3227,4638, 2861, 1220, 8024,  100,  794,  769, 6121, 4638, 2658, 1105, 3341,4692, 8024, 2356, 1767, 3300, 1726, 3265, 4638, 6839, 6496,  511,  100,102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0]),'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0]),'targets': tensor([ 0,  4,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5, 10,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,0,  0])}
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
    print('{0:10}  {1}'.format(token, id2label[label.item()]))
[CLS]       O
央           B-government
行           I-government
前           O
段           O
时           O
间           O
出           O
台           O
的           O
房           O
贷           O
新           O
政           O
对           O
个           O
人           O
住           O
房           O
贷           O
款           O
有           O
明           O
显           O
的           O
拉           O
动           O
,           O
[UNK]       O
从           O

Model training

train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

model = AutoModelForTokenClassification.from_pretrained('models/bert-base-chinese',
                                                        num_labels=len(id2label),
                                                        id2label=id2label,
                                                        label2id=label2id)
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
model.to(device)
Some weights of the model checkpoint at models/bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
cuda
BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1-11): 11 x BertLayer( ... same structure as layer (0) ... )
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=11, bias=True)
)
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss
tensor(2.4526, device='cuda:0', grad_fn=<NllLossBackward0>)
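As a rough sanity check, a randomly initialised classification head over 11 labels should give a cross-entropy near ln(11) ≈ 2.40, so an initial loss of about 2.45 is in the expected range.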
tr_logits = outputs[1]
tr_logits.shape
torch.Size([1, 128, 11])
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
from sklearn.metrics import accuracy_score
# Training function: fine-tune the BERT model on the 90% training split
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
        # use the attention mask to select the positions to compare (includes [CLS] and [SEP] predictions)
        active_accuracy = mask.view(-1) == 1  # shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # backward pass with gradient clipping
        # (clipping must happen after backward(), otherwise it has no effect)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
Training epoch: 1
Training loss per 100 training steps: 2.4715287685394287
Training loss per 100 training steps: 0.4533584124528536
Training loss per 100 training steps: 0.2905635407277897
Training loss per 100 training steps: 0.22304563949571496
Training loss per 100 training steps: 0.18531145965517906
Training loss per 100 training steps: 0.162208181106952
Training loss per 100 training steps: 0.14587406037737943
Training loss per 100 training steps: 0.13379905450313262
Training loss per 100 training steps: 0.12383504059240129
Training loss per 100 training steps: 0.11645007951776358
Training loss per 100 training steps: 0.10973321026950315
Training loss per 100 training steps: 0.10479672821780005
Training loss per 100 training steps: 0.09999178096184431
Training loss per 100 training steps: 0.09673410547066116
Training loss per 100 training steps: 0.09367919404762295
Training loss per 100 training steps: 0.09046410889920718
Training loss per 100 training steps: 0.08787275739825638
Training loss per 100 training steps: 0.08517808154395627
Training loss epoch: 0.08410522386139234
Training accuracy epoch: 0.928665125621188
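
At this point the fine-tuned weights only live in memory. A minimal sketch for persisting them (the target directory is an arbitrary choice, not from the original notebook):

# Save the fine-tuned model and the tokenizer so they can be reloaded later.
model.save_pretrained('models/bert-chinese-ner')
tokenizer.save_pretrained('models/bert-chinese-ner')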

Model validation

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
            # use the attention mask to select the positions to compare (includes [CLS] and [SEP] predictions)
            active_accuracy = mask.view(-1) == 1  # shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions
labels, predictions = valid(model, testing_loader)
Validation loss per 100 evaluation steps: 0.0013093583984300494
Validation loss per 100 evaluation steps: 0.04466064237772791
Validation loss per 100 evaluation steps: 0.04389420640539026
Validation loss per 100 evaluation steps: 0.04578652894750943
Validation Loss: 0.0471943554300529
Validation Accuracy: 0.9498030192637228

Computing NER metrics

from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))
              precision    recall  f1-score   support

     address       0.56      0.65      0.60       277
     company       0.67      0.84      0.75       300
  government       0.72      0.71      0.72       200
        name       0.83      0.90      0.86       362
organization       0.68      0.79      0.73       342

   micro avg       0.69      0.79      0.74      1481
   macro avg       0.69      0.78      0.73      1481
weighted avg       0.70      0.79      0.74      1481

Model inference

sentence = "我的名字是michal johnson,我的手机号是13425456344,我家住在东北松花江上8幢7单元6楼5号房"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
model.to(device)
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]
active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,) - predictions at the token level
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions))  # list of (wordpiece, prediction) tuples

word_level_predictions = []
for pair in wp_preds:
    if pair[0].startswith("##") or pair[0] in ['[CLS]', '[SEP]', '[PAD]']:
        # skip subword continuations and special tokens
        continue
    else:
        word_level_predictions.append(pair[1])

# join the tokens back into a string, dropping special tokens and merging wordpieces
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)
我 的 名 字 是 michal johnson , 我 的 手 机 号 是 13425456344 , 我 家 住 在 东 北 松 花 江 上 8 幢 7 单 元 6 楼 5 号 房
['O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address']
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("我的名字是michal johnson,我的手机号是13425456344,我家住在东北松花江上8幢7单元6楼5号房")
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

[{'entity_group': 'name',
  'score': 0.9393858,
  'word': 'michal johnson',
  'start': 5,
  'end': 19},
 {'entity_group': 'address',
  'score': 0.9075842,
  'word': '东 北 松 花 江 上 8 幢 7 单 元 6 楼 5 号 房',
  'start': 42,
  'end': 58}]
pipe("我叫王大,喜欢去旺角餐厅吃牛角包, 今年买了阿里巴巴的股票,我家住在新洲花园3栋4单元8988-1室")
[{'entity_group': 'name',
  'score': 0.7752586,
  'word': '王 大',
  'start': 2,
  'end': 4},
 {'entity_group': 'address',
  'score': 0.7672447,
  'word': '旺 角',
  'start': 8,
  'end': 10},
 {'entity_group': 'company',
  'score': 0.9173757,
  'word': '阿 里 巴 巴',
  'start': 22,
  'end': 26},
 {'entity_group': 'address',
  'score': 0.8909252,
  'word': '新 洲 花 园 3 栋 4 单 元 8988 - 1 室',
  'start': 34,
  'end': 50}]
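
If the model and tokenizer were saved as sketched after training, the pipeline can later be rebuilt straight from that directory (the path is the same assumed one, not from the original notebook):

from transformers import pipeline

# Reload the saved checkpoint directly into a token-classification pipeline.
pipe = pipeline(task="token-classification",
                model='models/bert-chinese-ner',
                tokenizer='models/bert-chinese-ner',
                aggregation_strategy="simple")
print(pipe("我叫王大,喜欢去旺角餐厅吃牛角包"))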
