# BERT 文本分类微调笔记

# BERT 实现文本分类微调 Demo

import random
from collections import namedtuple

# Task: four kinds of text need to be classified; solve the problem with BERT.

# A category pairs a label name with the seed sentences that belong to it.
Category = namedtuple('Category', ['name', 'samples'])

# The four categories used by the demo, each with its seed sentences.
categories = [
    # Weather forecast samples
    Category('Weather Forecast', ['今天北京晴转多云，气温20-25度。', '明天上海有小雨，记得带伞。']),
    # Company financial report samples
    Category('Company Financial Report', ['本季度公司净利润增长20%。', '年度财务报告显示，成本控制良好。']),
    # Company audit material samples
    Category('Company Audit Materials', ['审计发现内部控制存在漏洞。', '审计确认财务报表无重大错报。']),
    # Product marketing ad samples
    Category('Product Marketing Ad', ['新口味可乐，清爽上市！', '买一送一，仅限今日。']),
]


def generate_data(num_samples_per_category=50):
    """Generate a mock labelled dataset from the seed sentences.

    Args:
        num_samples_per_category: Number of samples drawn for each category.
            Each sample is picked with ``random.choice``, so sentences repeat.
            Defaults to 50.

    Returns:
        A list of ``(text, label)`` tuples, where ``label`` is the category
        name. Samples are grouped by category, in the order of ``categories``.
    """
    return [
        (random.choice(category.samples), category.name)
        for category in categories
        for _ in range(num_samples_per_category)
    ]


# Build the mock datasets by calling generate_data.
train_data = generate_data(100)  # 100 training samples per category (400 in total)
test_data = generate_data(6)     # 6 test samples per category (24 in total), a small demo set

# Example of what train_data looks like (excerpt):
# train_data =
# [('明天上海有小雨，记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨，记得带伞。', 'Weather Forecast'),
#  ('今天北京晴转多云，气温20-25度。', 'Weather Forecast'),
#  ('今天北京晴转多云，气温20-25度。', 'Weather Forecast'),
#  ('今天北京晴转多云，气温20-25度。', 'Weather Forecast'),
#  ('明天上海有小雨，记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨，记得带伞。', 'Weather Forecast'),
#  ('明天上海有小雨，记得带伞。', 'Weather Forecast'),
#  ('今天北京晴转多云，气温20-25度。', 'Weather Forecast'),
# ]

# AdamW is taken from torch.optim: the class of the same name in transformers
# is deprecated and is no longer importable from recent releases.
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F

# Step 1: map each category name to an integer label id.
label_map = {category.name: index for index, category in enumerate(categories)}
num_labels = len(categories)  # Total number of classes

# Step 2: initialise the BERT tokenizer and model.
# The texts being classified are Chinese, so the Chinese checkpoint is loaded
# instead of 'bert-base-uncased', which was pre-trained on English text.
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=num_labels)

# Step 3: prepare the datasets.
def encode_texts(texts, labels):
    """Tokenize texts and convert their label names to label ids.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of category names aligned with ``texts``; every name
            must be a key of ``label_map``.

    Returns:
        A tuple ``(encodings, label_ids)``: the tokenizer output requested as
        PyTorch tensors, and a tensor holding one label index per text.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    # Truncate/pad so all sequences in the batch share one length.
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    # Translate label names into their integer ids.
    label_ids = torch.tensor([label_map[label] for label in labels])
    return encodings, label_ids


def prepare_data(data, batch_size=8, shuffle=True):
    """Turn ``(text, label)`` pairs into a ``DataLoader``.

    Args:
        data: Iterable of ``(text, label)`` tuples.
        batch_size: Number of samples per batch. Defaults to 8.
        shuffle: Whether the loader reshuffles the samples. Defaults to True.

    Returns:
        A ``DataLoader`` whose batches are ``(input_ids, attention_mask,
        label_ids)`` triples.

    Raises:
        ValueError: If ``data`` is empty.
    """
    pairs = list(data)
    if not pairs:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError('data must contain at least one (text, label) pair')
    texts, labels = zip(*pairs)  # Split the pairs into parallel sequences
    encodings, label_ids = encode_texts(texts, labels)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Step 4: prepare the training and test data.
train_loader = prepare_data(train_data)
test_loader = prepare_data(test_data)

# Step 5: define the training and evaluation functions.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


def train_epoch(model, data_loader, optimizer):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer updating the model parameters.

    Returns:
        The mean loss per batch, or 0.0 when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for input_ids, attention_mask, labels in data_loader:
        # Move the batch to the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        num_batches += 1
    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0


def evaluate(model, data_loader):
    """Compute the classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The fraction of correctly classified samples, or 0.0 when the loader
        yields no samples.
    """
    model.eval()
    correct = 0
    total_count = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total_count += labels.size(0)
    # Guard against an empty loader instead of dividing by zero.
    return correct / total_count if total_count else 0.0


# Step 6: train the model.
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

# Use PyTorch's own AdamW; the class of the same name in transformers is
# deprecated in favour of this one.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer)
    acc = evaluate(model, test_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Test Accuracy: {acc*100:.2f}%')

# Step 7: use the fine-tuned model for prediction.
def predict(text):
    """Predict the category name of a text with the fine-tuned model.

    Args:
        text: The text to classify, given either as a string or as a list
            containing the text. If the list holds several texts, the
            category of the first one is returned.

    Returns:
        The ``name`` of the predicted entry in ``categories``.

    Raises:
        ValueError: If ``text`` is an empty list.
    """
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        raise ValueError('text must contain at least one string')

    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    model.eval()  # Inference must not run with dropout enabled
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Take the argmax per row; without ``dim`` the index would refer to the
    # flattened logits and be wrong as soon as more than one text is passed.
    predicted_class_id = torch.argmax(outputs.logits, dim=1)[0].item()
    return categories[predicted_class_id].name


# Predict the category of a new text.
new_text = ["明天的天气怎么样？"]  # Note: the text is passed wrapped in a list
predicted_category = predict(new_text)
print(f'The predicted category for the new text is: {predicted_category}')