5 Hand-Writing a Convolution Function
- Background
- Introduction
- The Sliding-Window Approach
  - Code
  - The Problem
- The Matrix-Multiplication Approach
  - How It Works
  - Code
  - Result
- Performance Comparison
  - Comparison Code
  - Log
  - Result
- Some Thoughts
Background
Starting with this post I'll be working through a series of hand-written implementations, beginning with the most classic one: convolution.
Introduction
I won't go over what a convolution actually is or how the operation is defined here.
For a hand-written convolution there are two approaches: the most naive one, implemented with a sliding window, and a second one that uses matrix multiplication to simplify and speed up the computation.
The Sliding-Window Approach
Convolution animations: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
The animations at the link above give a very direct feel for how convolution works, and they make a simple sliding-window implementation almost obvious. If it still isn't clear, I'd suggest searching Bilibili for a video tutorial.
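One thing worth writing down explicitly is the output size. For input height H, padding p, kernel size k, and stride s, the output height is

out_h = (H + 2p - k) // s + 1

For example, H = 5, p = 1, k = 3, s = 1 gives (5 + 2 - 3) // 1 + 1 = 5; the code below computes the same thing as (height - kernel_size) // stride + 1 after the input has already been padded.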
Code
"""
-*- coding: utf-8 -*-
使用滑动窗口方式的手动卷积
@Author : Leezed
@Time : 2025/6/27 15:33
"""import numpy as npclass ManualSlideWindowConv():"""手动实现卷积操作,使用滑动窗口方式没有实现反向传播功能"""def __init__(self, kernel_size, in_channel, out_channel, stride=1, padding=0, bias=True):self.kernel_size = kernel_sizeself.in_channel = in_channelself.out_channel = out_channelself.stride = strideself.padding = paddingself.bias = biasself.weight = np.random.randn(out_channel, in_channel, kernel_size, kernel_size)if bias:self.bias = np.random.randn(out_channel)else:self.bias = Nonedef print_weight(self):print("Weight shape:", self.weight.shape)print("Weight values:\n", self.weight)def get_weight(self):return self.weightdef set_weight(self, weight):if weight.shape != self.weight.shape:raise ValueError(f"Weight shape mismatch: expected {self.weight.shape}, got {weight.shape}")self.weight = weightdef __call__(self, x, *args, **kwargs):if self.padding > 0:x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant') # 在四周填充0batch_size, in_channel, height, width = x.shapekernel_size = self.kernel_size# 计算输出的高度和宽度out_height = (height - kernel_size) // self.stride + 1out_width = (width - kernel_size) // self.stride + 1output = np.zeros((batch_size, self.out_channel, out_height, out_width))for channel in range(self.out_channel):# 取出当前输出通道的权重kernel = self.weight[channel, :, :, :]# 添加biasif self.bias is not None:output[:, channel, :, :] += self.bias[channel]else:output[:, channel, :, :] = 0for i, end_height in enumerate(range(kernel_size - 1, height, self.stride)):for j, end_width in enumerate(range(kernel_size - 1, width, self.stride)):# 取出图像的滑动窗口start_height = end_height - kernel_size + 1start_width = end_width - kernel_size + 1window = x[:, :, start_height:end_height + 1, start_width:end_width + 1]# 计算卷积result = np.sum(kernel * window, axis=(1, 2, 3))output[:, channel, i, j] += resultreturn outputif __name__ == '__main__':# 测试代码x = np.random.randn(2, 3, 5, 5) # batch_size=2, in_channel=3, height=5, width=5conv_layer = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=1)output = conv_layer(x)print("Output shape:", output.shape)conv_layer.print_weight()
The Problem
The catch with the sliding-window approach is that it is very slow: there are three nested loops, and doing heavy numerical work with Python-level loops is thankless. I'll show the actual time cost in a chart further down.
The Matrix-Multiplication Approach
How It Works
https://zhuanlan.zhihu.com/p/360859627
https://gist.github.com/hsm207/7bfbe524bfd9b60d1a9e209759064180
https://blog.csdn.net/caip12999203000/article/details/126494740
I won't repeat the full derivation; the three links above explain it well if you read them carefully. The essential idea is to replace the many per-window multiply-and-sum operations of the sliding window with a single matrix multiplication (im2col), which gives a substantial speedup. The downside is that it materializes a fairly large intermediate matrix, so some memory overhead is unavoidable; in essence, it trades space for time.
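To make the idea concrete, here is a tiny demonstration (my own sketch; it uses torch.nn.functional.unfold for brevity instead of the hand-written unfold in the code below) showing that one matrix multiplication over the unfolded input reproduces conv2d:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 5, 5)
w = torch.randn(2, 3, 3, 3)

cols = F.unfold(x, kernel_size=3)           # (1, 27, 9): each column is one flattened 3x3x3 window
out = w.reshape(2, -1) @ cols               # (1, 2, 9): a single matmul replaces all the window sums
out = out.reshape(1, 2, 3, 3)

print(torch.allclose(out, F.conv2d(x, w), atol=1e-5))  # True (up to float32 rounding)
```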
A concrete example of the cost is shown in the figure: a layer built as
mmConv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=64, padding=1), applied to an input
x = np.random.randn(64, 3, 224, 224).astype(np.float32), already eats 1.5 GB+ of memory.
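A rough back-of-the-envelope estimate (my own, assuming float32 throughout; the class below actually keeps its weights in float64, so the real footprint is even larger) of where that memory goes:

```python
# sizes of the two main intermediates for x = (64, 3, 224, 224), 3x3 kernel, padding=1, stride=1
batch, c_in, c_out, k = 64, 3, 64, 3
num_windows = 224 * 224                                    # one window per output pixel
bytes_f32 = 4

im2col = batch * num_windows * (c_in * k * k) * bytes_f32  # the unfolded input matrix
matmul = batch * num_windows * c_out * bytes_f32           # the matmul result before reshape

print(f"im2col matrix: {im2col / 1024**3:.2f} GiB")        # ~0.32 GiB (plus a temporary copy from list -> array)
print(f"matmul output: {matmul / 1024**3:.2f} GiB")        # ~0.77 GiB
```

Add the temporary copies created by np.array and the transposes, and the 1.5 GB figure is easy to reach.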
Code
```python
class ManualMatMulConv():
    """
    Manually implemented convolution using matrix multiplication (im2col).
    No backward pass is implemented.
    """

    def __init__(self, kernel_size, in_channel, out_channel, stride=1, padding=0, bias=True):
        self.kernel_size = kernel_size
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.stride = stride
        self.padding = padding
        self.bias = bias
        self.weight = np.random.randn(out_channel, in_channel, kernel_size, kernel_size)
        if bias:
            self.bias = np.random.randn(out_channel)
        else:
            self.bias = None

    def print_weight(self):
        print("Weight shape:", self.weight.shape)
        print("Weight values:\n", self.weight)

    def get_weight(self):
        return self.weight

    def set_weight(self, weight):
        if weight.shape != self.weight.shape:
            raise ValueError(f"Weight shape mismatch: expected {self.weight.shape}, got {weight.shape}")
        self.weight = weight

    def __call__(self, x, *args, **kwargs):
        if self.padding > 0:
            # zero-pad the input on all four sides
            x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        batch_size, in_channel, height, width = x.shape
        kernel_size = self.kernel_size
        # compute the output height and width
        out_height = (height - kernel_size) // self.stride + 1
        out_width = (width - kernel_size) // self.stride + 1
        # flatten the weights into a matrix
        weight_matrix = self.weight.reshape(self.out_channel, -1)  # shape (out_channel, in_channel * kernel_size * kernel_size)
        # unfold the input into a matrix (hand-written im2col / unfold)
        unfolded_x = []
        for i in range(0, height - kernel_size + 1, self.stride):
            for j in range(0, width - kernel_size + 1, self.stride):
                # take the sliding window and flatten it into a row
                window = x[:, :, i:i + kernel_size, j:j + kernel_size].reshape(batch_size, -1)
                unfolded_x.append(window)
        unfolded_x = np.array(unfolded_x)  # shape: (num_windows, batch_size, in_channel * kernel_size * kernel_size)
        unfolded_x = np.transpose(unfolded_x, (1, 0, 2))  # shape: (batch_size, num_windows, in_channel * kernel_size * kernel_size)
        # compute the convolution as one matrix multiplication
        output = np.matmul(unfolded_x, weight_matrix.T)  # shape (batch_size, num_windows, out_channel)
        output = np.transpose(output, (0, 2, 1))  # shape (batch_size, out_channel, num_windows)
        output = output.reshape(batch_size, self.out_channel, out_height, out_width)
        # add the bias
        if self.bias is not None:
            output += self.bias.reshape(1, -1, 1, 1)
        return output
```
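An optional refinement I'd suggest (my own addition, not part of the original class): NumPy's sliding_window_view (NumPy ≥ 1.20) can build the same im2col matrix without the Python double loop, which also avoids the extra copy implied by appending to a list.

```python
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view


def unfold_numpy(x, k, stride=1):
    """x: (B, C, H, W) -> (B, num_windows, C*k*k), matching the hand-written unfold above."""
    windows = sliding_window_view(x, (k, k), axis=(2, 3))   # (B, C, H-k+1, W-k+1, k, k), a view, no copy
    windows = windows[:, :, ::stride, ::stride]             # apply the stride
    B, C, oh, ow, _, _ = windows.shape
    return windows.transpose(0, 2, 3, 1, 4, 5).reshape(B, oh * ow, C * k * k)
```

Dropping this in place of the loop in __call__ should produce the same unfolded_x (the window ordering and flattening order match), just without the Python-level iteration.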
Result
Verification code
```python
if __name__ == '__main__':
    # test code: share the same weights between the two implementations and compare outputs
    conv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=0, bias=False)
    slide_window_conv = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=0, bias=False)
    conv.set_weight(slide_window_conv.get_weight())

    x = np.random.randn(1, 3, 5, 5)  # input shape (batch_size, in_channel, height, width)
    output = conv(x)
    slide_window_output = slide_window_conv(x)
    print("Output shape:", output.shape)
    print("slide_window_output shape:", slide_window_output.shape)
    assert np.allclose(conv.get_weight(), slide_window_conv.get_weight()), "Weights do not match!"
    print("output:")
    print(output)
    print("slide_window_output:")
    print(slide_window_output)
    # check that the two implementations produce the same result
    assert np.allclose(output, slide_window_output), "Outputs do not match!"
    print("Outputs match!")
```
Performance Comparison
Four convolution implementations are compared here:
- sliding-window convolution
- matrix-multiplication convolution
- torch.nn.Conv2d
- torch.nn.Conv2d on CUDA
Comparison Code
```python
import numpy as np
from matplotlib import pyplot as plt
from manual.conv.slide_window import ManualSlideWindowConv
from manual.conv.matmul import ManualMatMulConv
import torch
import time

# compare convolution speed across different batch sizes
speeds = {
    'manual_matmul': [],
    'manual_slide_window': [],
    'torch': [],
    'torch_cuda': []
}

swConv = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=64, padding=1)
mmConv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=64, padding=1)
torchConv = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
torchCudaConv = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1).cuda()


def timing_conv(conv, x):
    start = time.time()
    y = conv(x)
    end = time.time()
    return y, end - start


for bs in [1, 2, 4, 8, 16, 32]:
    x = np.random.randn(bs, 3, 224, 224).astype(np.float32)
    x_torch = torch.from_numpy(x)
    x_torch_cuda = x_torch.cuda()

    y, speed = timing_conv(swConv, x)
    speeds['manual_slide_window'].append(speed)
    print(f'slide_window bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(mmConv, x)
    speeds['manual_matmul'].append(speed)
    print(f'matmul bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(torchConv, x_torch)
    speeds['torch'].append(speed)
    print(f'torch bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(torchCudaConv, x_torch_cuda)
    speeds['torch_cuda'].append(speed)
    print(f'torch_cuda bs={bs}, speed={speed:.4f}s')

    print('-' * 50)
```
Log
slide_window bs=1, speed=39.8342s
matmul bs=1, speed=0.1436s
torch bs=1, speed=0.0080s
torch_cuda bs=1, speed=0.0000s
--------------------------------------------------
slide_window bs=2, speed=39.8841s
matmul bs=2, speed=0.2185s
torch bs=2, speed=0.0172s
torch_cuda bs=2, speed=0.0010s
--------------------------------------------------
slide_window bs=4, speed=44.0416s
matmul bs=4, speed=0.3975s
torch bs=4, speed=0.0329s
torch_cuda bs=4, speed=0.0000s
--------------------------------------------------
slide_window bs=8, speed=41.7520s
matmul bs=8, speed=0.3222s
torch bs=8, speed=0.0588s
torch_cuda bs=8, speed=0.0000s
--------------------------------------------------
slide_window bs=16, speed=45.5278s
matmul bs=16, speed=0.5858s
torch bs=16, speed=0.1067s
torch_cuda bs=16, speed=0.0010s
--------------------------------------------------
slide_window bs=32, speed=58.1965s
matmul bs=32, speed=1.2161s
torch bs=32, speed=0.2045s
torch_cuda bs=32, speed=0.0010s
--------------------------------------------------
Result
Chart of the results with the slowest method (the sliding window) removed:
As the chart shows, the matrix-multiplication approach is reasonably fast, or at least far faster than the sliding window.
Some Thoughts
But this raises another question: why does the matrix-multiplication approach cost so much memory, while torch.nn.Conv2d doesn't seem to have the same problem?
After reading through some references, here is my brief summary:
- Tiled (tiling) or blocked (blocking) computation
  Although matrix-multiplication convolution conceptually unfolds the entire input against the kernels, real hardware implementations (e.g. on GPUs) do not necessarily process all of the data at once; they can break the computation into smaller "tiles" or "blocks".
  Local matrix multiplication: instead of unfolding the whole image in one go, each step performs the im2col transform and the matrix multiplication on only a small part of the input (for example one batch element, or a small region of one output channel). This bounds the size of the intermediate matrix and keeps the peak memory usage low; once a block is computed, its result is stitched back into the final output feature map. (A toy sketch of this idea appears at the end of this section.)
  Data reuse: the blocking strategy also makes better use of the CPU cache or GPU memory hierarchy, because the same small block of data can be reused repeatedly before it is fully processed, reducing traffic between main memory and the cache.
- Smart algorithm selection
  The backend dynamically picks the most suitable low-level algorithm for the given convolution parameters (e.g. Winograd, FFT, or an optimized direct convolution) instead of relying on im2col alone.
This is also why our hand-written implementation falls short of the official version.
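To make the tiling idea a bit more concrete, here is a toy sketch of my own (purely illustrative, not how cuDNN or PyTorch actually implements convolution): the im2col buffer only ever holds a few output rows at a time, so its size stays bounded no matter how large the image is.

```python
import numpy as np


def conv2d_matmul_tiled(x, weight, stride=1, rows_per_tile=8):
    """x: (B, C_in, H, W), weight: (C_out, C_in, k, k); no padding, for brevity."""
    B, C_in, H, W = x.shape
    C_out, _, k, _ = weight.shape
    out_h = (H - k) // stride + 1
    out_w = (W - k) // stride + 1
    w_mat = weight.reshape(C_out, -1)                        # (C_out, C_in*k*k)
    out = np.zeros((B, C_out, out_h, out_w), dtype=x.dtype)

    for row_start in range(0, out_h, rows_per_tile):         # handle a block of output rows at a time
        row_end = min(row_start + rows_per_tile, out_h)
        cols = []
        for oi in range(row_start, row_end):
            for oj in range(out_w):
                hi, wj = oi * stride, oj * stride
                cols.append(x[:, :, hi:hi + k, wj:wj + k].reshape(B, -1))
        tile = np.stack(cols, axis=1)                         # (B, tile_windows, C_in*k*k) -- stays small
        res = tile @ w_mat.T                                  # (B, tile_windows, C_out)
        res = res.transpose(0, 2, 1).reshape(B, C_out, row_end - row_start, out_w)
        out[:, :, row_start:row_end, :] = res                 # stitch the tile into the output
    return out
```

The result is identical to the un-tiled version; only the peak size of the intermediate matrix changes.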