当前位置: 首页 > news >正文

huggingface利用bert-base-chinese实现中文情感分类

使用原生 PyTorch 训练循环实现

先做一些数据预处理工作,本文主要使用的数据集是lansinuote/ChnSentiCorp

import torch
from datasets import load_dataset
from transformers import BertTokenizer

# Tokenizer loaded from the same checkpoint name as the encoder used later.
token = BertTokenizer.from_pretrained('bert-base-chinese')

# Load the ChnSentiCorp dataset; the 'train' and 'test' splits are used below.
dataset = load_dataset('lansinuote/ChnSentiCorp')
print(type(dataset))
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset exposing one split as ``(text, label)`` pairs.

    Args:
        dataset: An indexable, sized collection whose items are mappings
            with ``'text'`` and ``'label'`` keys.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Index the underlying split once per sample instead of once per field.
        row = self.dataset[idx]
        return row['text'], row['label']
# From here on `dataset` is the (text, label) wrapper around the training split.
dataset = Dataset(dataset['train'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def collate_fn(data):
    """Tokenize a batch of ``(text, label)`` samples into tensors on ``device``.

    Args:
        data: List of ``(text, label)`` pairs, as yielded by ``Dataset``.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``; every
        tensor has already been moved to ``device``.
    """
    sents = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]

    # Encode: every sentence is truncated/padded to exactly 500 tokens.
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        truncation=True,
        padding='max_length',
        max_length=500,
        return_tensors='pt',
    )

    # input_ids: the token ids produced by the encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    token_type_ids = encoded['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)

    return input_ids, attention_mask, token_type_ids, labels


loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)
len(loader)  # number of batches in the training loader

引入bert-base-chinese模型

from transformers import BertModel

# Load the pretrained encoder and place it on the selected device.
pretrained = BertModel.from_pretrained('bert-base-chinese').to(device)
sum(i.numel() for i in pretrained.parameters()) / 1e6  # total parameter count, in millions

# Freeze the encoder parameters so that only layers added on top are trained.
for param in pretrained.parameters():
    param.requires_grad = False

模型后面添加几个层

class Model(torch.nn.Module):
    """Pretrained encoder followed by a small trainable classification head.

    Args:
        pretrained: Encoder module. It is called as
            ``pretrained(input_ids, attention_mask=..., token_type_ids=...)``
            and its result must expose ``last_hidden_state``.
    """

    def __init__(self, pretrained):
        super().__init__()
        self.bert = pretrained
        self.fn1 = torch.nn.Linear(768, 256)
        self.relu = torch.nn.ReLU()
        self.fn2 = torch.nn.Linear(256, 768)
        # 768 is BERT's output dimension, 2 is the number of classes.
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the class logits for a batch of encoded sentences."""
        # The encoder runs without gradient tracking; only the head is trained.
        with torch.no_grad():
            output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # Two linear layers with a ReLU in between, applied to the hidden
        # state at the first sequence position.
        hidden = self.fn1(output.last_hidden_state[:, 0])
        hidden = self.relu(hidden)
        hidden = self.fn2(hidden)
        return self.classifier(hidden)

定义训练器

from transformers.optimization import get_scheduler


def train(epochs=1):
    """Train the global ``model`` on the global ``loader``.

    Args:
        epochs: Number of passes over ``loader``. The linear learning-rate
            schedule is sized to exactly ``len(loader) * epochs`` steps.
    """
    # torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
    # eps and weight_decay are pinned to what the transformers class is
    # understood to have defaulted to; confirm if exact parity matters.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        # Must match the number of optimizer steps actually taken below.
        num_training_steps=len(loader) * epochs,
    )

    model.train()
    for _ in range(epochs):
        for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Report progress every 10 batches.
            if i % 10 == 0:
                out = outputs.argmax(dim=1)
                accuracy = (out == labels).sum().item() / len(labels)
                lr = optimizer.param_groups[0]['lr']
                print(i, loss.item(), accuracy, lr)

开始训练

# Build the classifier around the frozen encoder and move its newly created
# layers to the same device as the input tensors.
model = Model(pretrained).to(device)

train()  # start training

测试

def test(max_batches=5):
    """Measure the accuracy of the global ``model`` on the test split.

    Args:
        max_batches: Maximum number of batches to evaluate. The default of 5
            keeps the check quick; pass ``None`` to use the whole test split.

    Returns:
        Fraction of correctly classified samples, or 0.0 if nothing was
        evaluated.
    """
    loader_test = torch.utils.data.DataLoader(
        Dataset(load_dataset('lansinuote/ChnSentiCorp')['test']),
        batch_size=32,
        collate_fn=collate_fn,
        # Shuffling makes the capped evaluation a random sample of the split.
        shuffle=True,
        # Keep the last partial batch so a full run covers every sample.
        drop_last=False,
    )

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
            if max_batches is not None and i >= max_batches:
                break
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            out = outputs.argmax(dim=1)
            correct += (out == labels).sum().item()
            total += len(labels)

    accuracy = correct / total if total else 0.0
    print('Accuracy:', accuracy)
    return accuracy


test()  # run the evaluation

使用 transformers 自带的 Trainer 工具实现

数据集是直接从 Hugging Face 下载的,Trainer 可以直接使用,无需再封装成自定义的 Dataset 类,只需要做一些简单的预处理(随机抽样和分词)。

import torch
from datasets import load_dataset

dataset = load_dataset('lansinuote/ChnSentiCorp')
# Keep small random subsets: 2000 training and 100 test examples.
dataset['train'] = dataset['train'].shuffle().select(range(2000))
dataset['test'] = dataset['test'].shuffle().select(range(100))


def f(data):
    """Tokenize a batch of rows, truncating each text to at most 512 tokens."""
    return token.batch_encode_plus(data['text'], truncation=True, max_length=512)


# Truncation already caps every sequence at 512 tokens, so no separate
# length filter pass is needed after this map.
dataset = dataset.map(f, batched=True, remove_columns=['text'], batch_size=1000, num_proc=3)

引入模型并添加几层

import torch
from transformers import BertModel

# Load the pretrained encoder (left on its default device in this section).
pretrained = BertModel.from_pretrained('bert-base-chinese')
sum(i.numel() for i in pretrained.parameters()) / 1e6  # total parameter count, in millions

# Freeze the encoder parameters so that only layers added on top are trained.
for param in pretrained.parameters():
    param.requires_grad = False
from transformers import BertModel


class Model(torch.nn.Module):
    """Pretrained encoder plus a trainable head that can also return a loss.

    Args:
        pretrained: Encoder module. It is called as
            ``pretrained(input_ids, attention_mask=..., token_type_ids=...)``
            and its result must expose ``last_hidden_state``.
    """

    def __init__(self, pretrained):
        super().__init__()
        self.bert = pretrained
        self.fn1 = torch.nn.Linear(768, 256)
        self.relu = torch.nn.ReLU()
        self.fn2 = torch.nn.Linear(256, 768)
        # 768 is BERT's output dimension, 2 is the number of classes.
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Compute class logits and, when ``labels`` is given, the loss.

        Returns:
            ``logits`` when ``labels`` is ``None``; otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the cross-entropy of the
            logits against ``labels``.
        """
        # The encoder runs without gradient tracking; only the head is trained.
        with torch.no_grad():
            output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        cls_output = output.last_hidden_state[:, 0]  # output at the [CLS] position
        hidden = self.fn1(cls_output)
        hidden = self.relu(hidden)
        hidden = self.fn2(hidden)
        logits = self.classifier(hidden)

        if labels is None:
            return logits

        loss = torch.nn.CrossEntropyLoss()(logits, labels)
        return loss, logits

注意:forward 函数中多加了一个参数 labels。数据集中带有 label 字段,Hugging Face 自带的各类任务模型也都接受 labels 参数,并在传入 labels 时返回 loss;如果不加这个参数、不返回 loss,Trainer 就无法正常调用该模型进行训练。

评估函数和训练器(Trainer)配置

import evaluate
import numpy as np
from transformers.trainer_utils import EvalPrediction

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred: EvalPrediction):
    """Compute accuracy from the predictions of one evaluation run.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the accuracy metric for the arg-max predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)


# Define the training setup
from transformers import Trainer, TrainingArguments

# Training arguments: evaluate every 20 steps, train for 2 epochs, no external
# reporting, CUDA disabled via no_cuda=True.
# NOTE(review): newer transformers releases may expect `eval_strategy` and
# `use_cpu` in place of `evaluation_strategy` and `no_cuda`; confirm against
# the installed version before renaming.
training_args = TrainingArguments(
    output_dir="./output_dir",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_steps=20,
    no_cuda=True,
    report_to='none',
)
# Trainer
from transformers import Trainer
from transformers import DataCollatorWithPadding

# Instantiate the labels-aware Model defined for this section; it wraps the
# frozen encoder.
model = Model(pretrained)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # Pads each batch with the tokenizer at collation time.
    data_collator=DataCollatorWithPadding(token),
    compute_metrics=compute_metrics,
)

训练和评估

# Run training with the configured Trainer.
trainer.train()
# Evaluate on the Trainer's eval_dataset; the returned value is not stored.
trainer.evaluate()

http://www.mrgr.cn/news/63289.html

相关文章:

  • 《跟我学Spring Boot开发》系列文章索引❤(2025.01.09更新)
  • 深入学习 Python 量化编程
  • 2025最新JAVA面试八股文【基础篇】
  • ansible 检查目录大小
  • Qt中.pro文件中可以填加的宏和其他的信息
  • presto不支持concat_ws
  • 从倍压整流到二极管钳位与限幅
  • Agent 大模型与应用场景之间的桥梁
  • 4路CAN转WiFi网关
  • Caffeine缓存库的LoadingCache:缓存计算或查询结果
  • Verilog HDL学习记录(3~4章)
  • PMP每日一练(二十一)
  • Spring Boot JPA中的Page组件详解
  • JavaScript 入门指南
  • 1. 让我们聊聊 Netty:高性能网络通信库
  • Tita:什么是 360 评估?
  • 计算机低能儿从0刷leetcode | 34.在排序数组中查找元素的第一个和最后一个位置 | 二分法
  • .net 在线客服系统,到底能不能处理 50万 级消息量,系统架构实践
  • HTTP返回码和其含义
  • Vue中ref、reactive、toRef、toRefs的区别
  • 超萌!HTMLCSS:超萌卡通熊猫头
  • 卷积、卷积操作、卷积神经网络原理探索
  • SpringMVC课时1
  • 简单的ELK部署学习
  • 排序——万亿数量级
  • linux基本指令之文件操作