python-pytorch+bert句子分类0.1.000

目录

- 引入包
- 加载预训练模型
- 加载数据文件
- 定义数据
- 实例化数据集
- 使用loader加载数据
  - 设定最大句子长度
  - 定义加padding的函数
  - 定义加collate_fn函数
  - 使用DataLoader加载数据
- 定义模型
  - 测试预训练模型输出
  - 测试预训练模型输出
  - 定义自己的模型
- 参考

引入包

import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

加载预训练模型

from transformers import BertTokenizer,BertForSequenceClassification,BertConfig
# Configuration, vocabulary and weights are all read from the same local
# checkpoint directory; the classification head is sized for 10 labels.
config = BertConfig.from_pretrained(
    "D:\\jpdir\\bert\\bertchinese",
    num_labels=10,
)
tokenizer = BertTokenizer.from_pretrained(
    "D:\\jpdir\\bert\\bertchinese"
)
model = BertForSequenceClassification.from_pretrained(
    "D:\\jpdir\\bert\\bertchinese",
    config=config,
)

d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'._np_qint8 = np.dtype([("qint8", np.int8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'._np_quint8 = np.dtype([("quint8", np.uint8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'._np_qint16 = np.dtype([("qint16", np.int16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'._np_quint16 = np.dtype([("quint16", np.uint16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'._np_qint32 = np.dtype([("qint32", np.int32, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.np_resource = np.dtype([("resource", np.ubyte, 1)])

加载数据文件

def _read_tsv_columns(path):
    """Read a UTF-8 ``text<TAB>label`` file into two parallel lists.

    Args:
        path: Location of the tab-separated file.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` holds the first column of every
        line and ``labels`` the second column with newline characters removed.

    Raises:
        IndexError: If a non-empty line contains no tab character.
    """
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file instead of materialising it with readlines().
        for line in f:
            # An empty line (typically at the end of the file) has no label
            # column and would otherwise raise IndexError below.
            if not line.strip("\r\n"):
                continue
            fields = line.split("\t")
            texts.append(fields[0])
            labels.append(fields[1].replace("\n", ""))
    return texts, labels


# NOTE: the names are kept as in the original for compatibility, but they are
# easy to misread: x_train / x_test are the texts / labels of train.txt.
x_train, x_test = _read_tsv_columns(
    "D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt"
)
# ... and y_train / y_test are the texts / labels of test.txt.
y_train, y_test = _read_tsv_columns(
    "D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt"
)

定义数据

class CustomDataset(Dataset):
    """Map-style dataset over a UTF-8 ``text<TAB>label`` file.

    Every item is a ``(content, label)`` tuple of strings: ``content`` is the
    first tab-separated column of a line, ``label`` is the second column with
    newline and double-quote characters removed.
    """

    def __init__(self, data_path):
        """Load the non-empty lines of ``data_path`` into ``self.data``.

        Args:
            data_path: Path of the tab-separated file to read.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            # Raw lines are stored (newline included) and parsed on access.
            # Empty lines are dropped so that a trailing blank line cannot
            # raise IndexError in __getitem__.
            self.data = [line for line in f if line.strip("\r\n")]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(content, label)`` strings stored at ``index``.

        Raises:
            IndexError: If ``index`` is out of range or the line has no tab.
        """
        fields = self.data[index].split("\t")
        content = fields[0]
        label = fields[1].replace("\n", "").replace("\"", "")
        return content, label

实例化数据集

# One dataset per split, each backed by its own tab-separated file.
train_data = CustomDataset(
    "D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt"
)
test_data = CustomDataset(
    "D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt"
)
# Notebook-style echo of the number of samples in each split.
(len(train_data), len(test_data))

(4610, 4768)

使用loader加载数据

设定最大句子长度

# Token-id sequences shorter than this are padded up to this length by
# add_padding.
maxlenhth = 32

定义加padding的函数

长度不够maxlength时就补pad，这里pad对应的索引是0

def add_padding(data, max_length=None):
    """Right-pad a list of token ids with zeros, in place.

    Args:
        data: List of token ids. It is modified in place.
        max_length: Target length. When ``None`` (the default) the
            module-level ``maxlenhth`` is used.

    Returns:
        The same list object. Lists that are already at least as long as the
        target are returned unchanged; nothing is ever truncated.
    """
    target = maxlenhth if max_length is None else max_length
    missing = target - len(data)
    if missing > 0:
        # A single extend replaces the per-element append loop.
        data.extend([0] * missing)
    return data

定义加collate_fn函数

这里处理tokenizer和padding

def collate_fn(batchData, tokenizer):
    """Turn a list of ``(sentence, label)`` samples into two batch tensors.

    Args:
        batchData: Sequence of samples; item 0 of each sample is the sentence
            text and item 1 is a label that ``int()`` can parse.
        tokenizer: Object whose ``encode(text, max_length=...,
            add_special_tokens=True)`` returns a list of token ids.

    Returns:
        A ``(input_ids, labels)`` tuple of tensors. Each row of ``input_ids``
        is one encoded sentence padded by ``add_padding``; ``labels`` is a 1-D
        tensor of the integer labels.

    Raises:
        ValueError: If a label cannot be converted to ``int``.
    """
    sentences = [sample[0] for sample in batchData]
    labels = [int(sample[1]) for sample in batchData]
    # The encoding limit is taken from maxlenhth, the same value add_padding
    # pads to, so the two cannot drift apart and produce ragged rows.
    input_ids = torch.tensor([
        add_padding(
            tokenizer.encode(text, max_length=maxlenhth, add_special_tokens=True)
        )
        for text in sentences
    ])
    return input_ids, torch.tensor(labels)

使用DataLoader加载数据

# Shuffled batches of 5 training samples; the lambda binds the tokenizer so
# that collate_fn matches the one-argument signature DataLoader calls.
loader = DataLoader(
    train_data,
    batch_size=5,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, tokenizer),
)
data_iter = iter(loader)
print(len(data_iter))

# Take a look at the first batch.
data = next(data_iter)
(
    "长度：", len(data[0]),
    "data[0]:", data[0],
    "data[1]:", data[1],
    "data:", data,
    data[0].size(),
    data[1].unsqueeze(1).size(),
)

922('长度：',5,'data[0]:',tensor([[ 101,  517,  682, 1957, 3187, 3127,  518, 3119, 6228, 1086, 1932, 1094,3209, 3241,  677, 4028, 1920, 5310, 2229,  102,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 2349, 7561,  680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521,  102,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629,  122,118,  124, 2233, 1762, 1545, 1059, 3621, 8380, 2835,  102,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 3791, 1744, 8226,  674,  782, 7770, 5440,  868, 3152, 1091,  100,3152, 1265, 3221, 1415,  886,  782, 2814, 3289,  100,  102,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101,  517, 7987,  722, 6484,  518,  100,  100, 2845, 1399, 2661, 5683,2458, 1423,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0]]),'data[1]:',tensor([9, 7, 1, 3, 8]),'data:',(tensor([[ 101,  517,  682, 1957, 3187, 3127,  518, 3119, 6228, 1086, 1932, 1094,3209, 3241,  677, 4028, 1920, 5310, 2229,  102,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 2349, 7561,  680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521,  102,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629,  122,118,  124, 2233, 1762, 1545, 1059, 3621, 8380, 2835,  102,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101, 3791, 1744, 8226,  674,  782, 7770, 5440,  868, 3152, 1091,  100,3152, 1265, 3221, 1415,  886,  782, 2814, 3289,  100,  102,    0,    0,0,    0,    0,    0,    0,    0,    0,    0],[ 101,  517, 7987,  722, 6484,  518,  100,  100, 2845, 1399, 2661, 5683,2458, 1423,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,0,    0,    0,    0,    0,    0,    0,    0]]),tensor([9, 7, 1, 3, 
8])),torch.Size([5, 32]),torch.Size([5, 1]))

定义模型

测试预训练模型输出

BertForSequenceClassification的输入input_ids size是[batch_size,maxlength],labels的size是[batch_size,1]
input_ids 是中文转成设定的数字
labels是数据的分类标签

测试预训练模型输出

loss 损失值
logits 各类别未归一化的得分（经过softmax后才是概率分布）

# Encode one sentence and add a leading batch dimension.
input_ids = torch.tensor(
    tokenizer.encode(
        "词汇阅读是关键 08年考研暑期英语复习全指南",
        max_length=32,
        add_special_tokens=True,
    )
).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
print(outputs)
# Index the result instead of tuple-unpacking it: indexing works both when
# the model returns a plain tuple and when it returns a ModelOutput object
# (iterating a ModelOutput would yield its keys, not its values).
loss, logits = outputs[0], outputs[1]
# Notebook-style echo of the loss and the logits.
loss, logits

(tensor(2.2565, grad_fn=<NllLossBackward0>), tensor([[ 0.5478, -0.0462, -0.2125, -0.8165,  0.1208, -0.4684, -0.9593,  0.4391,0.1320, -1.0400]], grad_fn=<AddmmBackward0>))(tensor(2.2565, grad_fn=<NllLossBackward0>),tensor([[ 0.5478, -0.0462, -0.2125, -0.8165,  0.1208, -0.4684, -0.9593,  0.4391,0.1320, -1.0400]], grad_fn=<AddmmBackward0>))

定义自己的模型

# Define model
class NeuralNetwork(nn.Module):
    """Three-layer fully connected classifier over flattened inputs.

    Args:
        in_features: Number of values per sample after flattening.
        hidden_features: Width of the two hidden layers.
        num_classes: Number of output logits.

    The defaults (28 * 28, 512, 10) reproduce the previously hard-coded sizes.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` from dim 1 onwards and return the logits."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model1 = NeuralNetwork()
print(model1)

NeuralNetwork((flatten): Flatten(start_dim=1, end_dim=-1)(linear_relu_stack): Sequential((0): Linear(in_features=784, out_features=512, bias=True)(1): ReLU()(2): Linear(in_features=512, out_features=512, bias=True)(3): ReLU()(4): Linear(in_features=512, out_features=10, bias=True))
)

# Fine-tuning updates every weight of the pretrained BERT model, so a small
# step size is used. The previous value of 0.001 is far above the 2e-5..5e-5
# range recommended for BERT fine-tuning and tends to make training diverge.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Kept for code that computes the loss from logits itself; not needed when
# the loss is read from the model output (labels are passed to the model).
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune on the batches produced by `loader`, printing the loss per step.
model.train()
for i, batch in enumerate(loader):
    optimizer.zero_grad()
    scentenses, labels = batch
    # Padding uses token id 0; mask those positions so self-attention ignores
    # them instead of treating the padding as real input.
    attention_mask = (scentenses != 0).long()
    output = model(
        scentenses,
        attention_mask=attention_mask,
        labels=labels.unsqueeze(1),
    )
    # Indexing works for both tuple and ModelOutput return values.
    loss, logits = output[0], output[1]
    loss.backward()
    optimizer.step()
    print(i, loss.item())