samout v1 预训练模型发布
数据集使用 minimind 数据集
训练代码
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from glob import glob
from tqdm import tqdm
from model import SamOut
import polars as pl
from collections import Counter

# Id of "<|pad|>" in the vocabulary list assembled by gen_voc(); used to pad
# samples and ignored by the training loss.
PAD_TOKEN_ID = 3
# Number of samples consumed per optimisation step in train().
BATCH_SIZE = 100
# Every pre-training sample is padded / truncated to this many token ids.
SEQ_LEN = 512


def train():
    """Continue pre-training SamOut on every ``pre_data_set_*.pkl`` shard.

    Reads the vocabulary from ``total_voc.pkl`` and the starting weights from
    ``pretrain.pth``, runs 20 epochs of next-token prediction on CUDA, and
    writes the weights back to ``pretrain.pth`` and the per-epoch mean loss
    list to ``loss916``.

    Raises:
        FileNotFoundError: if no ``./pre_data_set_*.pkl`` shard exists.
    """
    voc = pd.read_pickle("total_voc.pkl")
    net = SamOut(len(voc["voc"]), 512, 32, 8)
    net.load_state_dict(torch.load("pretrain.pth"))
    net.to("cuda")
    opt = torch.optim.Adam(params=net.parameters(), lr=0.00003)
    loss_func0 = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)

    bar = tqdm(range(20))
    steps = 0
    epoch_loss = []
    for epoch in bar:
        shard_paths = glob("./pre_data_set_*.pkl")
        if not shard_paths:
            raise FileNotFoundError(
                "no ./pre_data_set_*.pkl shards found; run gen_pre_data_align() first"
            )
        # Losses of every batch seen this epoch, across all shards, so the
        # recorded epoch loss is not just the last shard's mean.
        epoch_batch_losses = []
        for one_path in tqdm(shard_paths):
            data_set = pd.read_pickle(one_path)
            np.random.shuffle(data_set)
            loss_list = []
            for i in range(0, len(data_set), BATCH_SIZE):
                # Build the integer batch once; inputs and targets are the
                # same sequences shifted by one position.
                batch = torch.tensor(
                    data_set[i:i + BATCH_SIZE], dtype=torch.long
                ).to("cuda")
                out0, _ = net(batch[:, :-1])
                loss = loss_func0(
                    out0.reshape([-1, out0.shape[-1]]),
                    batch[:, 1:].reshape([-1]),
                )
                loss_list.append(loss.item())
                bar.set_description(
                    "epoch___{}____loss___{:.6f}____steps___{}".format(
                        epoch, np.mean(loss_list), steps
                    )
                )
                opt.zero_grad()
                loss.backward()
                opt.step()
                steps += BATCH_SIZE
            epoch_batch_losses.extend(loss_list)
            # NOTE(review): the original nesting of this save was lost when
            # the file was collapsed; checkpointing after every shard is the
            # conservative reading (an interrupted run loses at most a shard).
            torch.save(net.state_dict(), "pretrain.pth")
        epoch_loss.append(np.mean(epoch_batch_losses))
        pd.to_pickle(epoch_loss, "loss916")


def _count_in_chunks(items, counter, parts=8):
    """Add the element counts of *items* to *counter*, slice by slice.

    *items* is walked in about *parts* slices so tqdm can show progress.
    """
    # max(1, ...) keeps range() valid when there are fewer than *parts* items.
    step = max(1, len(items) // parts)
    for start in tqdm(range(0, len(items), step)):
        counter.update(Counter(items[start:start + step]))


def gen_one_voc():
    """Count symbol frequencies and write ``voc.pkl`` and ``voc0.pkl``.

    Symbols seen more than 100 times are written to ``voc.pkl`` as a list.
    Every other symbol is numbered in order of first appearance and paired
    with ``<|pos_{digit position}_{digit}|>`` tokens spelling that number in
    decimal; the pairs are written to ``voc0.pkl`` as ``[[[tokens, symbol],
    ...]]``.
    """
    data = pd.read_csv("pretrain_data.csv")
    data = "".join(data["text"].values.tolist())
    count = Counter()
    _count_in_chunks(data, count)
    del data  # release the concatenated corpus before loading more files

    # NOTE(review): the two SFT files are counted as DataFrames, and iterating
    # a DataFrame yields its column labels, not the text inside it. This looks
    # unintended, but it is kept as-is because changing it would alter the
    # vocabulary (and vocabulary size) that existing checkpoints depend on.
    data0 = pd.read_csv("sft_data_multi.csv")
    _count_in_chunks(data0, count)
    del data0
    data1 = pd.read_csv("sft_data_single.csv")
    _count_in_chunks(data1, count)
    del data1

    table = pd.DataFrame(
        {"voc": list(count.keys()), "count": list(count.values())}
    )
    voc = table.loc[table["count"] > 100, "voc"].values.tolist()
    rare = table.loc[table["count"] <= 100, "voc"].values.tolist()
    voc0 = [
        [
            [
                [
                    "<|pos_{}_{}|>".format(position, digit)
                    for position, digit in enumerate(str(index))
                ],
                symbol,
            ]
            for index, symbol in enumerate(rare)
        ]
    ]
    pd.to_pickle(voc, "voc.pkl")
    pd.to_pickle(voc0, "voc0.pkl")


def gen_voc():
    """Merge ``voc.pkl`` and ``voc0.pkl`` into ``total_voc.pkl``.

    The pickle holds ``{"voc": tokens, "voc0": rare_map}``. ``tokens`` is the
    five special tokens followed by the sorted frequent symbols and the 60
    ``<|pos_i_j|>`` tokens (i in 0..5, j in 0..9). ``rare_map`` maps each rare
    symbol to its list of ``<|pos_i_j|>`` tokens.
    """
    voc = pd.read_pickle("voc.pkl")
    voc0 = pd.read_pickle("voc0.pkl")
    voc0 = {symbol: tokens for tokens, symbol in voc0[0]}
    voc.extend(
        "<|pos_{}_{}|>".format(i, j) for i in range(6) for j in range(10)
    )
    # The order of the special tokens fixes their ids; "<|pad|>" must stay at
    # index 3 to match PAD_TOKEN_ID.
    voc = ["<|sos|>", "<|user|>", "<|agent|>", "<|pad|>", "<|history|>"] + sorted(voc)
    pd.to_pickle({"voc": voc, "voc0": voc0}, "total_voc.pkl")


def gen_pre_data_align(num, total_num):
    """Tokenise one slice of ``pretrain_data.csv`` into ``pre_data_set_{num}.pkl``.

    Args:
        num: 1-based index of the slice to process.
        total_num: number of equal slices the corpus is divided into. Rows
            left over by the integer division are only reached by calling
            with ``num == total_num + 1``.

    Each text becomes a list of exactly ``SEQ_LEN`` token ids: frequent
    symbols map to one id, rare symbols expand to the ids of their
    ``<|pos_i_j|>`` tokens, and the result is right-padded with
    ``PAD_TOKEN_ID`` or truncated.

    Raises:
        KeyError: if a text contains a symbol absent from ``total_voc.pkl``.
    """
    voc = pd.read_pickle("total_voc.pkl")

    # One pass builds the token -> id table (first occurrence wins, matching
    # list.index) instead of a linear list.index() search per token.
    token_index = {}
    for idx, token in enumerate(voc["voc"]):
        token_index.setdefault(token, idx)
    symbol_to_ids = {token: [idx] for token, idx in token_index.items()}
    for symbol, pos_tokens in voc["voc0"].items():
        symbol_to_ids[symbol] = [token_index[token] for token in pos_tokens]

    pre_data = pl.read_csv("pretrain_data.csv")["text"].to_numpy().tolist()
    count = len(pre_data) // total_num
    pre_data = pre_data[(num - 1) * count:count * num]

    data_set = []
    bar = tqdm(total=len(pre_data))
    # Pop from the end so the raw texts are released while the id lists grow.
    while pre_data:
        bar.update()
        text = pre_data.pop()
        if not text:
            # Null / empty rows carry no tokens to learn from.
            continue
        ids = [tid for symbol in text for tid in symbol_to_ids[symbol]]
        ids += [PAD_TOKEN_ID] * (SEQ_LEN - len(ids))
        data_set.append(ids[:SEQ_LEN])
    pd.to_pickle(data_set, "pre_data_set_{}.pkl".format(num))


if __name__ == '__main__':
    # One-off data preparation; uncomment on the first run, in this order:
    # gen_one_voc()
    # gen_voc()
    # for i in range(17,18):
    #     gen_pre_data_align(i, 16)
    train()
这段代码是一个深度学习项目的训练部分,主要目的是训练一个名为 `SamOut` 的神经网络模型。以下是代码的主要组成部分和功能:
- 导入必要的库:导入了 `matplotlib.pyplot`、`numpy`、`pandas`、`torch`、`glob`、`tqdm`、`polars`、`Counter` 等库,用于数据处理、模型训练和可视化。
- 定义 `train` 函数:
  - 加载词汇表 `total_voc.pkl`。
  - 初始化 `SamOut` 模型,并加载预训练权重 `pretrain.pth`。
  - 将模型转移到 GPU 上。
  - 定义优化器 `Adam` 和损失函数 `CrossEntropyLoss`。
  - 使用 `tqdm` 进度条进行训练,训练 20 个 epoch。
  - 在每个 epoch 中,遍历数据集,计算损失,更新模型权重。
  - 每个 epoch 结束后,保存模型权重和损失记录。
- 定义 `gen_one_voc` 函数:
  - 从 CSV 文件中读取文本数据,计算每个字符的出现频率。
  - 根据频率筛选出出现次数大于 100 的字符作为词汇表。
  - 保存词汇表到 `voc.pkl`;出现次数不超过 100 的低频字符则被编码为 `<|pos_i_j|>` 标记序列,保存到 `voc0.pkl`。
- 定义 `gen_voc` 函数:
  - 加载词汇表 `voc.pkl` 和 `voc0.pkl`。
  - 将词汇表扩展,包括特殊标记 `<|sos|>`、`<|user|>`、`<|agent|>`、`<|pad|>`、`<|history|>`。
  - 保存扩展后的词汇表到 `total_voc.pkl`。
- 定义 `gen_pre_data_align` 函数:
  - 加载扩展后的词汇表 `total_voc.pkl`。
  - 读取 CSV 文件中的文本数据,将其转换为词汇表的索引表示。
  - 将数据集分割成多个部分,每个部分保存为一个 pickle 文件。
- 主函数:
  - 调用 `gen_one_voc`、`gen_voc`、`gen_pre_data_align` 函数生成数据集(在发布的代码中这些调用已被注释掉,首次运行时需要先取消注释)。
  - 调用 `train` 函数进行模型训练。

总的来说,这段代码的目的是训练一个基于字符的神经网络模型,用于处理文本数据。
import torch


class MaxState(torch.nn.Module):
    """Token mixer built on a running maximum along the sequence axis.

    Three bias-free linear projections are taken of the input. The first two
    are split into heads, summed, scaled by ``1 / sqrt(head_size)`` and
    reduced with ``torch.cummax`` over the sequence, so position ``t`` only
    sees positions ``<= t``. The result is combined with the other two
    projections as ``(m + p2) * m + p1``.
    """

    def __init__(self, hidden_dim, heads, win):
        """
        Args:
            hidden_dim: feature size; must be divisible by ``heads``.
            heads: number of heads the features are split into.
            win: accepted for interface compatibility; not used.
        """
        super(MaxState, self).__init__()
        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."
        self.head_size = hidden_dim // heads
        self.head0 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head_num = heads
        self.hidden = hidden_dim

    def forward(self, input_data, state=None):
        """Mix ``input_data`` of shape (batch, seq, hidden).

        Returns:
            ``(output, state)``: output has the input's shape; ``state`` is
            handed back unchanged.
        """
        b, s = input_data.shape[0], input_data.shape[1]
        k, h = self.head_num, self.head_size

        proj0 = self.head0(input_data)
        proj1 = self.head1(input_data)
        proj2 = self.head2(input_data)

        # (batch, seq, hidden) -> (batch, heads, seq, head_size)
        split0 = proj0.reshape([b, s, k, h]).permute([0, 2, 1, 3])
        split1 = proj1.reshape([b, s, k, h]).permute([0, 2, 1, 3])

        # Running maximum over the sequence axis (dim 2) keeps this causal.
        running_max = torch.cummax((split0 + split1) / h ** 0.5, 2)[0]

        # Back to (batch, seq, hidden).
        out = running_max.permute([0, 2, 1, 3]).reshape([b, s, -1])
        out = (out + proj2) * out + proj1
        return out, state


class KAttention(torch.nn.Module):
    """Causal multi-head scaled dot-product self-attention.

    Alternative token mixer with the same ``forward`` interface as MaxState.
    """

    def __init__(self, hidden_dim, heads):
        super(KAttention, self).__init__()
        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."
        self.head_size = hidden_dim // heads
        self.q = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.k = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head_num = heads

    def forward(self, x, state=None):
        """Attend over ``x`` of shape (batch, seq, hidden).

        Returns:
            ``(output, state)``: output has the input's shape; ``state`` is
            handed back unchanged.
        """
        b, s, h, d = x.shape[0], x.shape[1], self.head_num, self.head_size
        q = self.q(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        k = self.k(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        v = self.v(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])

        qk = (q @ k.permute([0, 1, 3, 2])) / d ** 0.5
        # Lower-triangular mask, created on the input's own device, so that
        # position t only attends to positions <= t.
        visible = torch.tril(torch.ones(s, s, dtype=torch.bool, device=x.device))
        qk = qk.masked_fill(~visible, float("-inf"))

        qkv = torch.nn.functional.softmax(qk, -1) @ v
        qkv = qkv.permute([0, 2, 1, 3]).reshape([b, s, -1])
        return qkv, state


class FeedForward(torch.nn.Module):
    """Gated feed-forward block: ``ffn2(ffn1(x) * relu(gate(x)))``."""

    def __init__(self, hidden_size):
        super(FeedForward, self).__init__()
        self.ffn1 = torch.nn.Linear(hidden_size, hidden_size * 2)
        self.ffn2 = torch.nn.Linear(hidden_size * 2, hidden_size)
        self.gate = torch.nn.Linear(hidden_size, hidden_size * 2)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply the gated MLP on the last dimension of ``x``."""
        value = self.ffn1(x)
        gate = self.relu(self.gate(x))
        return self.ffn2(value * gate)


class DecoderLayer(torch.nn.Module):
    """MaxState mixer followed by a gated MLP and a LayerNorm.

    The output is ``LayerNorm(alpha * ffn(mixer(x)) + (1 - alpha) * x)``
    where ``alpha`` is a learnable scalar initialised to 0.5.
    """

    def __init__(self, hidden_size, num_heads):
        super(DecoderLayer, self).__init__()
        # KAttention(hidden_size, num_heads) is a drop-in alternative here.
        self.self_attention = MaxState(hidden_size, num_heads, 8)
        self.ffn = FeedForward(hidden_size)
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        self.alpha = torch.nn.Parameter(torch.tensor(0.5))

    def forward(self, x, state=None, seq_len=None):
        """Run the layer on ``x``; ``seq_len`` is accepted but not used.

        Returns:
            ``(output, state)`` with ``state`` as returned by the mixer.
        """
        mixed, state = self.self_attention(x, state)
        x = self.layer_norm(self.alpha * self.ffn(mixed) + (1 - self.alpha) * x)
        return x, state


class SamOut(torch.nn.Module):
    """Decoder-only language model stacking ``num_layers`` DecoderLayers.

    Token id 3 is the padding index of the embedding. Learned position
    embeddings are re-injected in front of every layer through a per-layer
    ``2 * hidden -> hidden`` projection.
    """

    def __init__(self, voc_size, hidden_size, num_heads, num_layers):
        super(SamOut, self).__init__()
        self.em = torch.nn.Embedding(voc_size, hidden_size, padding_idx=3)
        self.pos = torch.nn.Embedding(1024, hidden_size)
        self.decoder_layers = torch.nn.ModuleList(
            [DecoderLayer(hidden_size, num_heads) for _ in range(num_layers)]
        )
        self.head = torch.nn.Linear(hidden_size, voc_size, False)
        self.down = torch.nn.ModuleList(
            [torch.nn.Linear(2 * hidden_size, hidden_size, False) for _ in range(num_layers)]
        )

    def state_forward(self, state, pos, x):
        """Run all decoder layers on ``x`` (batch, seq, hidden).

        Args:
            state: list with one entry per layer, or None to start fresh.
                A passed-in list is updated in place.
            pos: position embeddings of shape (1, seq, hidden).
            x: token embeddings.

        Returns:
            ``(hidden_states, state)``.
        """
        if state is None:
            state = [None] * len(self.decoder_layers)
        # Broadcast the positions over the batch once; the expanded view lives
        # on the same device as the embeddings.
        pos = pos.expand(x.shape[0], -1, -1)
        for i, decoder_layer in enumerate(self.decoder_layers):
            x = self.down[i](torch.cat([pos, x], -1))
            x1, state[i] = decoder_layer(x, state[i])
            x = x1 + x
        return x, state

    def pos_forward(self, x):
        """Return position embeddings of shape (1, seq, hidden) for ``x``.

        Sequences of 1024 or more positions reuse the 1024-row table twice:
        the embedding of ``index // 1024`` is added to that of
        ``index % 1024``.
        """
        index = torch.arange(0, x.shape[1], device=x.device).long()
        if x.shape[1] >= 1024:
            pos = self.pos(index // 1024).unsqueeze(0)
            pos = self.pos(index % 1024).unsqueeze(0) + pos
        else:
            pos = self.pos(index).unsqueeze(0)
        return pos

    def forward(self, x0):
        """Map token ids (batch, seq) to ``(logits, state)`` from a fresh state."""
        x0, _ = self.one_forward(x0, state=None)
        return x0, _

    def one_forward(self, x, state=None, seq_len=None):
        """Embed ``x``, run the decoder stack and project to vocabulary logits.

        ``seq_len`` is accepted but not used.
        """
        x = self.em(x)
        pos = self.pos_forward(x)
        x, state = self.state_forward(state, pos, x)
        return self.head(x), state


# Default device for scripts using this module. The model itself no longer
# reads this global: it creates tensors on the device of its input.
device = "cuda" if torch.cuda.is_available() else "cpu"
if __name__ == '__main__':
    # Smoke test: build a small model and push one random batch of token ids
    # (2 sequences of 8 * 13 = 104 tokens) through it on `device`.
    net = SamOut(235, 256, 16, 4)
    net.to(device)
    net(torch.randint(0, 200, [2, 8 * 13]).to(device))