
samout v1 pretrained model release

The dataset used is the minimind dataset.

Training code

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from glob import glob
from tqdm import tqdm
from model import SamOut
import polars as pl
from collections import Counter


def train():
    voc = pd.read_pickle("total_voc.pkl")

    net = SamOut(len(voc["voc"]), 512, 32, 8)
    net.load_state_dict(torch.load("pretrain.pth"))
    net.to("cuda")

    opt = torch.optim.Adam(params=net.parameters(), lr=0.00003)
    loss_func0 = torch.nn.CrossEntropyLoss(ignore_index=3)

    bar = tqdm(range(20))
    steps = 0
    epoch_loss = []
    for epoch in bar:
        for one_path in tqdm(glob("./pre_data_set_*.pkl")):
            data_set = pd.read_pickle(one_path)
            np.random.shuffle(data_set)
            loss_list = []
            for i in range(0, len(data_set), 100):
                # weights.append(list(net.state_dict().values())[0])
                j = i + 100
                input_one = data_set[i:j]

                out0, _ = net(torch.Tensor(input_one)[:, :-1].int().to("cuda"))
                loss = loss_func0(out0.reshape([-1, out0.shape[-1]]),
                                  torch.Tensor(input_one)[:, 1:].reshape([-1]).long().to("cuda"))

                loss_list.append(loss.item())
                bar.set_description(
                    "epoch___{}____loss___{:.6f}____steps___{}".format(epoch, np.mean(loss_list), steps))
                opt.zero_grad()
                loss.backward()
                opt.step()
                steps += 100

        torch.save(net.state_dict(), "pretrain.pth")
        # eval_model()
        epoch_loss.append(np.mean(loss_list))
        pd.to_pickle(epoch_loss, "loss916")


def gen_one_voc():
    data = pd.read_csv("pretrain_data.csv")
    data = data["text"].values.tolist()
    data = "".join(data)

    count = Counter()
    for ii in tqdm(range(0, len(data), len(data) // 8)):
        jj = ii + len(data) // 8
        for k, v in Counter(data[ii:jj]).items():
            count[k] = count.get(k, 0) + v
    data = ""

    data0 = pd.read_csv("sft_data_multi.csv")
    for ii in tqdm(range(0, len(data0), len(data0) // 8)):
        jj = ii + len(data0) // 8
        for k, v in Counter(data0[ii:jj]).items():
            count[k] = count.get(k, 0) + v
    data0 = ""

    data1 = pd.read_csv("sft_data_single.csv")
    for ii in tqdm(range(0, len(data1), len(data1) // 8)):
        jj = ii + len(data1) // 8
        for k, v in Counter(data1[ii:jj]).items():
            count[k] = count.get(k, 0) + v
    data1 = ""

    # plt.plot(sorted(count.values()))
    # plt.show()
    count = pd.DataFrame({"voc": count.keys(), "count": count.values()})
    voc = count.loc[count["count"] > 100, "voc"].values.tolist()
    voc0 = [[[["<|pos_{}_{}|>".format(jj, ii) for jj, ii in enumerate(list(str(i)))], j] for i, j in
             enumerate(count.loc[count["count"] <= 100, "voc"].values.tolist())]]
    pd.to_pickle(voc, "voc.pkl")
    pd.to_pickle(voc0, "voc0.pkl")


def gen_voc():
    voc = pd.read_pickle("voc.pkl")
    voc0 = pd.read_pickle("voc0.pkl")
    voc0 = {j: i for i, j in voc0[0]}
    for i in range(6):
        for j in range(10):
            voc.append("<|pos_{}_{}|>".format(i, j))
    voc = ["<|sos|>", "<|user|>", "<|agent|>", "<|pad|>", "<|history|>"] + sorted(voc)
    pd.to_pickle({"voc": voc, "voc0": voc0}, "total_voc.pkl")


def gen_pre_data_align(num, total_num):
    voc = pd.read_pickle("total_voc.pkl")
    voc["voc0"] = [[i, [voc["voc"].index(j) for j in ii]] for i, ii in voc["voc0"].items()]
    voc["voc"] = [i for i in voc["voc"]]
    voc = {"voc": voc["voc"] + [i for i, j in voc["voc0"]],
           "voc_id": [[i] for i in list(range(len(voc["voc"])))] + [j for i, j in voc["voc0"]]}
    voc = pd.DataFrame(voc)
    # voc = pl.DataFrame(voc)

    pre_data = pl.read_csv("pretrain_data.csv")
    pre_data = pre_data["text"].to_numpy().tolist()
    count = len(pre_data) // total_num
    pre_data = pre_data[(num - 1) * count:count * num]

    data_set = []
    bar = tqdm(range(len(pre_data)))
    while pre_data:
        bar.update()
        one = pre_data.pop()
        one = pd.merge(pd.DataFrame({"voc": list(one)}), voc, on="voc", how="left")
        thr = np.hstack(one["voc_id"].to_numpy()).tolist()
        thr += (518 - len(thr)) * [3]
        thr = thr[:512]
        data_set.append(thr)
    pd.to_pickle(data_set, "pre_data_set_{}.pkl".format(num))


if __name__ == '__main__':
    # gen_one_voc()
    # gen_voc()
    # for i in range(17, 18):
    #     gen_pre_data_align(i, 16)
    train()
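matplotlib is imported above but never actually called in the script. As a minimal sketch (an assumption, not part of the release) of how the per-epoch loss history that train() writes to the loss916 pickle could be inspected afterwards:

import matplotlib.pyplot as plt
import pandas as pd

# "loss916" is the list of per-epoch mean losses written by train() with pd.to_pickle
epoch_loss = pd.read_pickle("loss916")
plt.plot(epoch_loss)
plt.xlabel("epoch")
plt.ylabel("mean cross-entropy loss")
plt.show()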

This code is the training part of a deep learning project; its purpose is to train a neural network model called SamOut. The main components of the code are:

  1. Import the required libraries
    • matplotlib.pyplot, numpy, pandas, torch, glob, tqdm, polars, and Counter are imported for data processing, model training, and visualization.
  2. Define the train function
    • Load the vocabulary total_voc.pkl.
    • Initialize the SamOut model and load the pretrained weights pretrain.pth.
    • Move the model to the GPU.
    • Define the Adam optimizer and the CrossEntropyLoss loss function.
    • Train for 20 epochs with a tqdm progress bar.
    • In each epoch, iterate over the dataset, compute the loss, and update the model weights.
    • At the end of each epoch, save the model weights and the loss history.
  3. Define the gen_one_voc function
    • Read the text data from the CSV files and count how often each character occurs.
    • Keep the characters that occur more than 100 times as the vocabulary.
    • Save the vocabulary to voc.pkl.
  4. Define the gen_voc function
    • Load the vocabularies voc.pkl and voc0.pkl.
    • Extend the vocabulary with the special tokens <|sos|>, <|user|>, <|agent|>, <|pad|>, and <|history|>.
    • Save the extended vocabulary to total_voc.pkl.
  5. Define the gen_pre_data_align function
    • Load the extended vocabulary total_voc.pkl.
    • Read the text data from the CSV file and convert it into vocabulary-index sequences (see the encoding sketch after this list).
    • Split the dataset into shards and save each shard as a pickle file.
  6. Main entry point
    • Call gen_one_voc, gen_voc, and gen_pre_data_align to generate the dataset.
    • Call train to train the model.

Overall, the purpose of this code is to train a character-level neural network model for processing text data.
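As a rough illustration of what gen_pre_data_align produces, the sketch below maps one text row to a fixed-length id sequence. The toy vocabulary and the sample string are made up for the example; the real pipeline looks ids up in total_voc.pkl, where index 3 is the <|pad|> token that CrossEntropyLoss ignores.

# Hypothetical toy vocabulary; the real one is loaded from total_voc.pkl.
voc = ["<|sos|>", "<|user|>", "<|agent|>", "<|pad|>", "<|history|>", "一", "天", "好"]
char_to_id = {ch: i for i, ch in enumerate(voc)}

text = "好天一"                               # made-up sample row from pretrain_data.csv
ids = [char_to_id[ch] for ch in text]        # [7, 6, 5]
ids += (518 - len(ids)) * [3]                # right-pad with the <|pad|> id, as in gen_pre_data_align
ids = ids[:512]                              # truncate to the 512-token context length

print(ids[:6])   # [7, 6, 5, 3, 3, 3]
print(len(ids))  # 512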
Model code

import torch

device = "cuda"


class MaxState(torch.nn.Module):
    def __init__(self, hidden_dim, heads, win):
        super(MaxState, self).__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        self.head0 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        # self.h_linear = torch.nn.Parameter(torch.empty(1, 1))
        # torch.nn.init.xavier_uniform_(self.h_linear, 0.5)
        # self.layer_nor = torch.nn.LayerNorm(hidden_dim)
        # self.norm = torch.nn.LayerNorm(hidden_dim)
        # self.alpha = torch.nn.Parameter(torch.tensor(0.5))
        self.head_num = heads
        self.hidden = hidden_dim

    def forward(self, input_data, state=None):
        # self.head.to(device)
        b, s, k, h = input_data.shape[0], input_data.shape[1], self.head_num, self.head_size

        out = self.head0(input_data)
        out1 = self.head1(input_data)
        out2 = self.head2(input_data)

        out = out.reshape([b, s, k, h]).permute([0, 2, 1, 3])
        out1 = out1.reshape([b, s, k, h]).permute([0, 2, 1, 3])
        # out2 = out2.reshape([b, s, k, h]).permute([0, 2, 1, 3])
        # out1 = self.head1(input_data).reshape([b, s, k, h]).permute([0, 2, 1, 3])

        out = torch.cummax((out + out1) / h ** 0.5, 2)[0]
        # out = torch.cummin((out + out1) / k ** 0.5, 2)[0]
        # out_sum = torch.cumsum((out + out1) / k ** 0.5, 2)
        # out = (out - out_min) * out

        out = out.permute([0, 2, 1, 3])
        out1 = out1.permute([0, 2, 1, 3])
        # out2 = out2.permute([0, 2, 1, 3])
        out = out.reshape([b, s, -1])
        out1 = out1.reshape([b, s, -1])
        # out2 = out2.reshape([b, s, -1])
        # out = self.layer_nor(out)
        # out = (out + out2) * out + out1
        # out3 = torch.cummax(out, 1)[0]
        out = (out + out2) * out + out1
        # out = self.alpha * out * (out + out2) + (1 - self.alpha) * out1
        return out, state


class KAttention(torch.nn.Module):
    def __init__(self, hidden_dim, heads):
        super(KAttention, self).__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        self.q = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.k = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        # self.state = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head_num = heads

    def forward(self, x, state=None):
        b, s, h, d = x.shape[0], x.shape[1], self.head_num, self.head_size
        q = self.q(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        k = self.k(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        v = self.v(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        qk = (q @ k.permute([0, 1, 3, 2])) / d ** 0.5
        mask = torch.triu(torch.ones(s, s).to(device))
        qk = torch.where(mask.T == 1, qk, torch.Tensor([-float('inf')]).to(device))
        qkv = torch.nn.functional.softmax(qk, -1) @ v
        # v + torch.arange(1, 3 * s, 3).reshape([1, 1, -1, 1]).to(device) / s / 3)
        qkv = qkv.permute([0, 2, 1, 3]).reshape([b, s, -1])
        return qkv, state


class FeedForward(torch.nn.Module):
    def __init__(self, hidden_size):
        super(FeedForward, self).__init__()

        self.ffn1 = torch.nn.Linear(hidden_size, hidden_size * 2)
        self.ffn2 = torch.nn.Linear(hidden_size * 2, hidden_size)
        self.gate = torch.nn.Linear(hidden_size, hidden_size * 2)
        # self.h_linear = torch.nn.Parameter(torch.empty(1, 1))
        # self.gate = torch.nn.Parameter(torch.empty(hidden_size, hidden_size * 2))
        # torch.nn.init.xavier_uniform_(self.gate, 0.5)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x1 = self.ffn1(x)
        x2 = self.relu(self.gate(x))
        xx = x1 * x2
        x = self.ffn2(xx)
        return x


class DecoderLayer(torch.nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(DecoderLayer, self).__init__()

        # self.self_attention = MaskMultiHeadAttention(hidden_size, num_heads)
        self.self_attention = MaxState(hidden_size, num_heads, 8)
        # self.self_attention = KAttention(hidden_size, num_heads)
        self.ffn = FeedForward(hidden_size)
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        # self.norm = L2Norm()
        # self.layer_nor = torch.nn.LayerNorm(hidden_dim)
        # self.norm = torch.nn.LayerNorm(hidden_dim)
        self.alpha = torch.nn.Parameter(torch.tensor(0.5))
        # ha = self.norm(self.attention(h))
        # # update the input with the scaled attention output
        # h = self.norm(h + self.attention_scale * (ha - h))
        # # pass the updated input through the MLP layer and normalize
        # hm = self.norm(self.mlp(h))
        # # final update of the input with the scaled MLP output
        # h = self.norm(h + self.mlp_scale * (hm - h))
        # return the processed result

    def forward(self, x, state=None, seq_len=None):
        x1, state = self.self_attention(x, state)
        x = self.layer_norm(self.alpha * self.ffn(x1) + (1 - self.alpha) * x)
        return x, state


class SamOut(torch.nn.Module):
    def __init__(self, voc_size, hidden_size, num_heads, num_layers):
        super(SamOut, self).__init__()

        self.em = torch.nn.Embedding(voc_size, hidden_size, padding_idx=3)
        self.pos = torch.nn.Embedding(1024, hidden_size)

        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(hidden_size, num_heads) for _ in range(num_layers)])
        self.head = torch.nn.Linear(hidden_size, voc_size, False)
        # self.head_state = torch.nn.Linear(hidden_size, num_layers, False)

        self.down = torch.nn.ModuleList(
            [torch.nn.Linear(2 * hidden_size, hidden_size, False) for _ in range(num_layers)])
        # self.down = torch.nn.Linear(2 * hidden_size, hidden_size, False)

    def state_forward(self, state, pos, x):
        if state is None:
            state = [None] * len(self.decoder_layers)
        i = 0
        for ii, decoder_layer in enumerate(self.decoder_layers):
            x = self.down[i](torch.concat([torch.zeros([x.shape[0], 1, 1]).to(device) + pos, x], -1))
            # x = self.down[i](torch.concat([x2, x], -1))
            # x = self.down(torch.concat([torch.zeros([x.shape[0], 1, 1]).to(device) + pos, x], -1))
            x1, state[i] = decoder_layer(x, state[i])
            x = x1 + x
            i += 1
        return x, state

    def pos_forward(self, x):
        if x.shape[1] >= 1024:
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(device) // 1024).unsqueeze(0)
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(device) % 1024).unsqueeze(0) + pos
        else:
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(device)).unsqueeze(0)
        return pos

    def forward(self, x0):
        x0, _ = self.one_forward(x0, state=None)
        return x0, _

    def one_forward(self, x, state=None, seq_len=None):
        x = self.em(x)
        pos = self.pos_forward(x)
        x, state = self.state_forward(state, pos, x)
        return self.head(x), state


if __name__ == '__main__':
    net = SamOut(235, 256, 16, 4)
    net.to(device)
    net(torch.randint(0, 200, [2, 8 * 13]).to(device))
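The post does not include an inference script. Below is a minimal greedy-decoding sketch, an assumption based on the training code above rather than part of the release: the file names, the character-level lookup into total_voc.pkl, and the stop condition are all guesses.

import pandas as pd
import torch

from model import SamOut

device = "cuda"

voc = pd.read_pickle("total_voc.pkl")                  # same vocabulary used for pretraining
net = SamOut(len(voc["voc"]), 512, 32, 8)              # sizes match train() above
net.load_state_dict(torch.load("pretrain.pth", map_location=device))
net.to(device)
net.eval()

ids = [voc["voc"].index(ch) for ch in "一天"]           # naive char-to-id lookup (assumption)
with torch.no_grad():
    for _ in range(64):                                # generate at most 64 new tokens
        x = torch.tensor([ids], dtype=torch.long, device=device)
        logits, _ = net(x)                             # forward() returns (logits, state)
        next_id = int(logits[0, -1].argmax())          # greedy choice of the next token
        if next_id == 3:                               # stop at the <|pad|> id (assumption)
            break
        ids.append(next_id)

print("".join(voc["voc"][i] for i in ids))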

