当前位置：首页 > news >正文

深度学习模块创作（缝合）教程|适合1-360月小宝宝食用，干货满满

news 2025/7/15 3:31:09

本文为深度学习模块的缝合教程，欢迎纠正补充

在创作深度学习模块之前，首先要有配置一个深度学习的环境（本文教程基于pytorch），可按照下方连接配置，本文不再赘述。

深度学习（Pytorch）环境配置_pytorch配置-CSDN博客

为照顾刚入门的新手同学，这里我们先自己创作一个简单的深度学习模块，模块要求连续进行两次卷积并带有Resnet结构，最好能控制中间层通道（c_）的大小，代码如下：

class Block(nn.Module):def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):super().__init__()c_ = int(c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 3, 1)self.cv2 = Conv(c_, c2, 3, 1, g=g)self.add = shortcut and c1 == c2def forward(self, x):return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))

如上方代码，深度学习模块继承自（nn.Module类），通常在构造函数（__init__）中定义该模块中包含的子模块，在forwad函数中定义执行顺序，结构如下图。

有经验的小伙伴可能已经看出来了，这个模块就是C2f与C3模块中的Bottleneck模块。了解深度学习模块定义之后再看下面如何使用模块改进网络来写论文。

改进网络模块由浅到深可以分为三个层次，第一个层次是将其他方向别人的创作模块放到自己的中，简单的融合其他模块，比如换个主干、卷积等等，现在这种方式只推荐在深度学习还没有普及的领域使用。第二种层次也是论文用的最多的层次，就是将已经有的模块融入到现有的其他模块中，组成新的模块，像是增加、替换模块或结构。下面以Bottleneck模块与AKConv为例演示几种融合代码。

class Block_AKConv(nn.Module):def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):super().__init__()c_ = int(c2 * e)  # hidden channelsself.cv1 = AKConv(c1, c_, 1, 5)self.cv2 = Conv(c_, c2, 3, 1, g=g)self.add = shortcut and c1 == c2def forward(self, x):return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))class AKConv(nn.Module):def __init__(self, c1, c2, stride=1, num_param=5):"""初始化参数说明：inc: 输入通道数, outc: 输出通道数, num_param:（卷积核）参数量, stride = 1:卷积步长默认为1, bias = None：默认无偏执"""super(AKConv, self).__init__()self.num_param = num_paramself.stride = strideself.conv = Conv(c1, c2, k=(num_param, 1), s=(num_param, 1))self.p_conv = nn.Conv2d(c1, 2 * num_param, kernel_size=3, padding=1, stride=stride)nn.init.constant_(self.p_conv.weight, 0)self.p_conv.register_full_backward_hook(self._set_lr)@staticmethoddef _set_lr(module, grad_input, grad_output):grad_input = (grad_input[i] * 0.1 for i in range(len(grad_input)))grad_output = (grad_output[i] * 0.1 for i in range(len(grad_output)))def forward(self, x):# N is num_param.offset = self.p_conv(x)dtype = offset.data.type()N = offset.size(1) // 2# (b, 2N, h, w)p = self._get_p(offset, dtype)# (b, h, w, 2N)p = p.contiguous().permute(0, 2, 3, 1)q_lt = p.detach().floor()q_rb = q_lt + 1q_lt = torch.cat([torch.clamp(q_lt[..., :N], 0, x.size(2) - 1), torch.clamp(q_lt[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_rb = torch.cat([torch.clamp(q_rb[..., :N], 0, x.size(2) - 1), torch.clamp(q_rb[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_lb = torch.cat([q_lt[..., :N], q_rb[..., N:]], dim=-1)q_rt = torch.cat([q_rb[..., :N], q_lt[..., N:]], dim=-1)# clip pp = torch.cat([torch.clamp(p[..., :N], 0, x.size(2) - 1), torch.clamp(p[..., N:], 0, x.size(3) - 1)], dim=-1)# bilinear kernel (b, h, w, N)g_lt = (1 + (q_lt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_lt[..., N:].type_as(p) - p[..., N:]))g_rb = (1 - (q_rb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_rb[..., N:].type_as(p) - p[..., N:]))g_lb = (1 + (q_lb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_lb[..., N:].type_as(p) - p[..., N:]))g_rt = (1 - (q_rt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_rt[..., N:].type_as(p) - p[..., N:]))# resampling the features based on the modified coordinates.x_q_lt = self._get_x_q(x, q_lt, N)x_q_rb = self._get_x_q(x, q_rb, N)x_q_lb = self._get_x_q(x, q_lb, N)x_q_rt = self._get_x_q(x, q_rt, N)# bilinearx_offset = g_lt.unsqueeze(dim=1) * x_q_lt + \g_rb.unsqueeze(dim=1) * x_q_rb + \g_lb.unsqueeze(dim=1) * x_q_lb + \g_rt.unsqueeze(dim=1) * x_q_rtx_offset = self._reshape_x_offset(x_offset, self.num_param)out = self.conv(x_offset)return out# generating the inital sampled shapes for the AKConv with different sizes.def _get_p_n(self, N, dtype):base_int = round(math.sqrt(self.num_param))row_number = self.num_param // base_intmod_number = self.num_param % base_intp_n_x, p_n_y = torch.meshgrid(torch.arange(0, row_number),torch.arange(0, base_int), indexing='xy')p_n_x = torch.flatten(p_n_x)p_n_y = torch.flatten(p_n_y)if mod_number > 0:mod_p_n_x, mod_p_n_y = torch.meshgrid(torch.arange(row_number, row_number + 1),torch.arange(0, mod_number), indexing='xy')mod_p_n_x = torch.flatten(mod_p_n_x)mod_p_n_y = torch.flatten(mod_p_n_y)p_n_x, p_n_y = torch.cat((p_n_x, mod_p_n_x)), torch.cat((p_n_y, mod_p_n_y))p_n = torch.cat([p_n_x, p_n_y], 0)p_n = p_n.view(1, 2 * N, 1, 1).type(dtype)return p_n# no zero-paddingdef _get_p_0(self, h, w, N, dtype):p_0_x, p_0_y = torch.meshgrid(torch.arange(0, h * self.stride, self.stride),torch.arange(0, w * self.stride, self.stride), indexing='xy')p_0_x = torch.flatten(p_0_x).view(1, 1, h, w).repeat(1, N, 1, 1)p_0_y = torch.flatten(p_0_y).view(1, 1, h, w).repeat(1, N, 1, 1)p_0 = torch.cat([p_0_x, p_0_y], 1).type(dtype)return p_0def _get_p(self, offset, dtype):N, h, w = offset.size(1) // 2, offset.size(2), offset.size(3)# (1, 2N, 1, 1)p_n = self._get_p_n(N, dtype)# (1, 2N, h, w)p_0 = self._get_p_0(h, w, N, dtype)p = p_0 + p_n + offsetreturn pdef _get_x_q(self, x, q, N):b, h, w, _ = q.size()padded_w = x.size(3)c = x.size(1)# (b, c, h*w)x = x.contiguous().view(b, c, -1)# (b, h, w, N)index = q[..., :N] * padded_w + q[..., N:]  # offset_x*w + offset_y# (b, c, h*w*N)index = index.contiguous().unsqueeze(dim=1).expand(-1, c, -1, -1, -1).contiguous().view(b, c, -1)index = index.clamp(min=0, max=x.shape[-1] - 1)x_offset = x.gather(dim=-1, index=index).contiguous().view(b, c, h, w, N)return x_offset#  Stacking resampled features in the row direction.@staticmethoddef _reshape_x_offset(x_offset, num_param):b, c, h, w, n = x_offset.size()x_offset = rearrange(x_offset, 'b c h w n -> b c (h n) w')return x_offset

结构图1：

class Block_AKConv(nn.Module):def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):super().__init__()c_ = int(c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 3, 1)self.md = AKConv(c1, c_, 1, 5)self.cv2 = Conv(c_, c2, 3, 1, g=g)self.add = shortcut and c1 == c2def forward(self, x):return x + self.cv2(self.md(self.cv1(x))) if self.add else self.cv2(self.md(self.cv1(x)))class AKConv(nn.Module):def __init__(self, c1, c2, stride=1, num_param=5):"""初始化参数说明：inc: 输入通道数, outc: 输出通道数, num_param:（卷积核）参数量, stride = 1:卷积步长默认为1, bias = None：默认无偏执"""super(AKConv, self).__init__()self.num_param = num_paramself.stride = strideself.conv = Conv(c1, c2, k=(num_param, 1), s=(num_param, 1))self.p_conv = nn.Conv2d(c1, 2 * num_param, kernel_size=3, padding=1, stride=stride)nn.init.constant_(self.p_conv.weight, 0)self.p_conv.register_full_backward_hook(self._set_lr)@staticmethoddef _set_lr(module, grad_input, grad_output):grad_input = (grad_input[i] * 0.1 for i in range(len(grad_input)))grad_output = (grad_output[i] * 0.1 for i in range(len(grad_output)))def forward(self, x):# N is num_param.offset = self.p_conv(x)dtype = offset.data.type()N = offset.size(1) // 2# (b, 2N, h, w)p = self._get_p(offset, dtype)# (b, h, w, 2N)p = p.contiguous().permute(0, 2, 3, 1)q_lt = p.detach().floor()q_rb = q_lt + 1q_lt = torch.cat([torch.clamp(q_lt[..., :N], 0, x.size(2) - 1), torch.clamp(q_lt[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_rb = torch.cat([torch.clamp(q_rb[..., :N], 0, x.size(2) - 1), torch.clamp(q_rb[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_lb = torch.cat([q_lt[..., :N], q_rb[..., N:]], dim=-1)q_rt = torch.cat([q_rb[..., :N], q_lt[..., N:]], dim=-1)# clip pp = torch.cat([torch.clamp(p[..., :N], 0, x.size(2) - 1), torch.clamp(p[..., N:], 0, x.size(3) - 1)], dim=-1)# bilinear kernel (b, h, w, N)g_lt = (1 + (q_lt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_lt[..., N:].type_as(p) - p[..., N:]))g_rb = (1 - (q_rb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_rb[..., N:].type_as(p) - p[..., N:]))g_lb = (1 + (q_lb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_lb[..., N:].type_as(p) - p[..., N:]))g_rt = (1 - (q_rt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_rt[..., N:].type_as(p) - p[..., N:]))# resampling the features based on the modified coordinates.x_q_lt = self._get_x_q(x, q_lt, N)x_q_rb = self._get_x_q(x, q_rb, N)x_q_lb = self._get_x_q(x, q_lb, N)x_q_rt = self._get_x_q(x, q_rt, N)# bilinearx_offset = g_lt.unsqueeze(dim=1) * x_q_lt + \g_rb.unsqueeze(dim=1) * x_q_rb + \g_lb.unsqueeze(dim=1) * x_q_lb + \g_rt.unsqueeze(dim=1) * x_q_rtx_offset = self._reshape_x_offset(x_offset, self.num_param)out = self.conv(x_offset)return out# generating the inital sampled shapes for the AKConv with different sizes.def _get_p_n(self, N, dtype):base_int = round(math.sqrt(self.num_param))row_number = self.num_param // base_intmod_number = self.num_param % base_intp_n_x, p_n_y = torch.meshgrid(torch.arange(0, row_number),torch.arange(0, base_int), indexing='xy')p_n_x = torch.flatten(p_n_x)p_n_y = torch.flatten(p_n_y)if mod_number > 0:mod_p_n_x, mod_p_n_y = torch.meshgrid(torch.arange(row_number, row_number + 1),torch.arange(0, mod_number), indexing='xy')mod_p_n_x = torch.flatten(mod_p_n_x)mod_p_n_y = torch.flatten(mod_p_n_y)p_n_x, p_n_y = torch.cat((p_n_x, mod_p_n_x)), torch.cat((p_n_y, mod_p_n_y))p_n = torch.cat([p_n_x, p_n_y], 0)p_n = p_n.view(1, 2 * N, 1, 1).type(dtype)return p_n# no zero-paddingdef _get_p_0(self, h, w, N, dtype):p_0_x, p_0_y = torch.meshgrid(torch.arange(0, h * self.stride, self.stride),torch.arange(0, w * self.stride, self.stride), indexing='xy')p_0_x = torch.flatten(p_0_x).view(1, 1, h, w).repeat(1, N, 1, 1)p_0_y = torch.flatten(p_0_y).view(1, 1, h, w).repeat(1, N, 1, 1)p_0 = torch.cat([p_0_x, p_0_y], 1).type(dtype)return p_0def _get_p(self, offset, dtype):N, h, w = offset.size(1) // 2, offset.size(2), offset.size(3)# (1, 2N, 1, 1)p_n = self._get_p_n(N, dtype)# (1, 2N, h, w)p_0 = self._get_p_0(h, w, N, dtype)p = p_0 + p_n + offsetreturn pdef _get_x_q(self, x, q, N):b, h, w, _ = q.size()padded_w = x.size(3)c = x.size(1)# (b, c, h*w)x = x.contiguous().view(b, c, -1)# (b, h, w, N)index = q[..., :N] * padded_w + q[..., N:]  # offset_x*w + offset_y# (b, c, h*w*N)index = index.contiguous().unsqueeze(dim=1).expand(-1, c, -1, -1, -1).contiguous().view(b, c, -1)index = index.clamp(min=0, max=x.shape[-1] - 1)x_offset = x.gather(dim=-1, index=index).contiguous().view(b, c, h, w, N)return x_offset#  Stacking resampled features in the row direction.@staticmethoddef _reshape_x_offset(x_offset, num_param):b, c, h, w, n = x_offset.size()x_offset = rearrange(x_offset, 'b c h w n -> b c (h n) w')return x_offset

结构图2：

class Block_AKConv(nn.Module):def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):super().__init__()c_ = int(c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 3, 1)self.cv3 = AKConv(c1, c2, 1, 5)self.cv2 = Conv(c_, c2, 3, 1, g=g)self.add = shortcut and c1 == c2def forward(self, x):return x + self.cv2(self.cv1(x)) + self.cv3(x) if self.add else self.cv2(self.cv1(x)) + self.cv3(x)class AKConv(nn.Module):def __init__(self, c1, c2, stride=1, num_param=5):"""初始化参数说明：inc: 输入通道数, outc: 输出通道数, num_param:（卷积核）参数量, stride = 1:卷积步长默认为1, bias = None：默认无偏执"""super(AKConv, self).__init__()self.num_param = num_paramself.stride = strideself.conv = Conv(c1, c2, k=(num_param, 1), s=(num_param, 1))self.p_conv = nn.Conv2d(c1, 2 * num_param, kernel_size=3, padding=1, stride=stride)nn.init.constant_(self.p_conv.weight, 0)self.p_conv.register_full_backward_hook(self._set_lr)@staticmethoddef _set_lr(module, grad_input, grad_output):grad_input = (grad_input[i] * 0.1 for i in range(len(grad_input)))grad_output = (grad_output[i] * 0.1 for i in range(len(grad_output)))def forward(self, x):# N is num_param.offset = self.p_conv(x)dtype = offset.data.type()N = offset.size(1) // 2# (b, 2N, h, w)p = self._get_p(offset, dtype)# (b, h, w, 2N)p = p.contiguous().permute(0, 2, 3, 1)q_lt = p.detach().floor()q_rb = q_lt + 1q_lt = torch.cat([torch.clamp(q_lt[..., :N], 0, x.size(2) - 1), torch.clamp(q_lt[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_rb = torch.cat([torch.clamp(q_rb[..., :N], 0, x.size(2) - 1), torch.clamp(q_rb[..., N:], 0, x.size(3) - 1)],dim=-1).long()q_lb = torch.cat([q_lt[..., :N], q_rb[..., N:]], dim=-1)q_rt = torch.cat([q_rb[..., :N], q_lt[..., N:]], dim=-1)# clip pp = torch.cat([torch.clamp(p[..., :N], 0, x.size(2) - 1), torch.clamp(p[..., N:], 0, x.size(3) - 1)], dim=-1)# bilinear kernel (b, h, w, N)g_lt = (1 + (q_lt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_lt[..., N:].type_as(p) - p[..., N:]))g_rb = (1 - (q_rb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_rb[..., N:].type_as(p) - p[..., N:]))g_lb = (1 + (q_lb[..., :N].type_as(p) - p[..., :N])) * (1 - (q_lb[..., N:].type_as(p) - p[..., N:]))g_rt = (1 - (q_rt[..., :N].type_as(p) - p[..., :N])) * (1 + (q_rt[..., N:].type_as(p) - p[..., N:]))# resampling the features based on the modified coordinates.x_q_lt = self._get_x_q(x, q_lt, N)x_q_rb = self._get_x_q(x, q_rb, N)x_q_lb = self._get_x_q(x, q_lb, N)x_q_rt = self._get_x_q(x, q_rt, N)# bilinearx_offset = g_lt.unsqueeze(dim=1) * x_q_lt + \g_rb.unsqueeze(dim=1) * x_q_rb + \g_lb.unsqueeze(dim=1) * x_q_lb + \g_rt.unsqueeze(dim=1) * x_q_rtx_offset = self._reshape_x_offset(x_offset, self.num_param)out = self.conv(x_offset)return out# generating the inital sampled shapes for the AKConv with different sizes.def _get_p_n(self, N, dtype):base_int = round(math.sqrt(self.num_param))row_number = self.num_param // base_intmod_number = self.num_param % base_intp_n_x, p_n_y = torch.meshgrid(torch.arange(0, row_number),torch.arange(0, base_int), indexing='xy')p_n_x = torch.flatten(p_n_x)p_n_y = torch.flatten(p_n_y)if mod_number > 0:mod_p_n_x, mod_p_n_y = torch.meshgrid(torch.arange(row_number, row_number + 1),torch.arange(0, mod_number), indexing='xy')mod_p_n_x = torch.flatten(mod_p_n_x)mod_p_n_y = torch.flatten(mod_p_n_y)p_n_x, p_n_y = torch.cat((p_n_x, mod_p_n_x)), torch.cat((p_n_y, mod_p_n_y))p_n = torch.cat([p_n_x, p_n_y], 0)p_n = p_n.view(1, 2 * N, 1, 1).type(dtype)return p_n# no zero-paddingdef _get_p_0(self, h, w, N, dtype):p_0_x, p_0_y = torch.meshgrid(torch.arange(0, h * self.stride, self.stride),torch.arange(0, w * self.stride, self.stride), indexing='xy')p_0_x = torch.flatten(p_0_x).view(1, 1, h, w).repeat(1, N, 1, 1)p_0_y = torch.flatten(p_0_y).view(1, 1, h, w).repeat(1, N, 1, 1)p_0 = torch.cat([p_0_x, p_0_y], 1).type(dtype)return p_0def _get_p(self, offset, dtype):N, h, w = offset.size(1) // 2, offset.size(2), offset.size(3)# (1, 2N, 1, 1)p_n = self._get_p_n(N, dtype)# (1, 2N, h, w)p_0 = self._get_p_0(h, w, N, dtype)p = p_0 + p_n + offsetreturn pdef _get_x_q(self, x, q, N):b, h, w, _ = q.size()padded_w = x.size(3)c = x.size(1)# (b, c, h*w)x = x.contiguous().view(b, c, -1)# (b, h, w, N)index = q[..., :N] * padded_w + q[..., N:]  # offset_x*w + offset_y# (b, c, h*w*N)index = index.contiguous().unsqueeze(dim=1).expand(-1, c, -1, -1, -1).contiguous().view(b, c, -1)index = index.clamp(min=0, max=x.shape[-1] - 1)x_offset = x.gather(dim=-1, index=index).contiguous().view(b, c, h, w, N)return x_offset#  Stacking resampled features in the row direction.@staticmethoddef _reshape_x_offset(x_offset, num_param):b, c, h, w, n = x_offset.size()x_offset = rearrange(x_offset, 'b c h w n -> b c (h n) w')return x_offset

结构三：

上面是BottleNeck模块融合AKConv的几种方式，当然不局限于此，还可以并联串联更多模块，或使用Concat多处拼接特征层后卷积，或者增加相加、拼接条件等等，如下图：

以上种种就属于模块的二次融合创新了，有没有效果还要具体在自己的数据集上测试。最后是最高一层的创新，学习优秀模块思路，组建自己的模块。这里比较典型的模块结构（网络）就是Resnet残差块，可以说没有何凯明的Resnet，深度学习难以发展到现在的高度，这也是深度学习的基石性研究。

论文连接：Deep Residual Learning for Image Recognition (cv-foundation.org)

Resnet文章旨在解决深层网络的退化问题，文中最重要的结构如下：

深度神经网络中的Inception模块结构如下：

Resnet可以解决网络退化问题，但是频繁操作输入张量会造成网络臃肿。Inception虽然可以提取多尺度特征，但是会有一定的网络退化问题。接下来大家可以思考一个问题，如何将两种模块的思想融合到一起，组成一个新的模块优化上面的问题呢？那不妨看下今年CVPR2024最新的文章InceptionNeXt。

文章链接：https://arxiv.org/pdf/2303.16900

class InceptionDWConv2d(nn.Module):""" Inception depthweise convolution"""def __init__(self, in_channels, square_kernel_size=3, band_kernel_size=11, branch_ratio=0.125):super().__init__()gc = int(in_channels * branch_ratio)  # channel numbers of a convolution branchself.dwconv_hw = nn.Conv2d(gc, gc, square_kernel_size, padding=square_kernel_size // 2, groups=gc)self.dwconv_w = nn.Conv2d(gc, gc, kernel_size=(1, band_kernel_size), padding=(0, band_kernel_size // 2),groups=gc)self.dwconv_h = nn.Conv2d(gc, gc, kernel_size=(band_kernel_size, 1), padding=(band_kernel_size // 2, 0),groups=gc)self.split_indexes = (in_channels - 3 * gc, gc, gc, gc)def forward(self, x):x_id, x_hw, x_w, x_h = torch.split(x, self.split_indexes, dim=1)return torch.cat((x_id, self.dwconv_hw(x_hw), self.dwconv_w(x_w), self.dwconv_h(x_h)),dim=1,)

结构上，InceptionDWConv2d首先对输入的张量x切分，分别执行卷积核大小3×3、1×11、11×1的三组深度可分离卷积和一组恒等映射，既能感知多尺度特征，同时只对一部分x进行操作，减小了计算量，主要代码也很短，仅有简单的四行发顶会。

至此，本文主要内容结束。