5 Hand-Writing a Convolution Function
- Background
- Introduction
- The Sliding-Window Approach
  - Code
  - The Problem
- The Matrix-Multiplication Approach
  - How It Works
  - Code
  - Result
- Performance Comparison
  - Comparison Code
  - Log
  - Result
- Some Thoughts
Background
Starting with this post I'll be working through a series of hand-written implementations, beginning with the most classic one: convolution.
Introduction
I won't go over what a convolution actually is or how the operation is defined here.
For a hand-written convolution there are two approaches: the most naive one, implemented with a sliding window, and a second one that uses matrix multiplication to simplify and speed up the computation.
The Sliding-Window Approach
Convolution animations: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
The animations at the link above give a very direct feel for how convolution works, and they make a simple sliding-window implementation almost obvious. If it still isn't clear, I'd suggest searching Bilibili for a video tutorial.
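One thing worth writing down explicitly is the output size. For input height H, padding p, kernel size k, and stride s, the output height is

out_h = (H + 2p - k) // s + 1

For example, H = 5, p = 1, k = 3, s = 1 gives (5 + 2 - 3) // 1 + 1 = 5; the code below computes the same thing as (height - kernel_size) // stride + 1 after the input has already been padded.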
Code
"""
-*- coding: utf-8 -*-
使用滑动窗口方式的手动卷积
@Author : Leezed
@Time : 2025/6/27 15:33
"""import numpy as npclass ManualSlideWindowConv():"""手动实现卷积操作,使用滑动窗口方式没有实现反向传播功能"""def __init__(self, kernel_size, in_channel, out_channel, stride=1, padding=0, bias=True):self.kernel_size = kernel_sizeself.in_channel = in_channelself.out_channel = out_channelself.stride = strideself.padding = paddingself.bias = biasself.weight = np.random.randn(out_channel, in_channel, kernel_size, kernel_size)if bias:self.bias = np.random.randn(out_channel)else:self.bias = Nonedef print_weight(self):print("Weight shape:", self.weight.shape)print("Weight values:\n", self.weight)def get_weight(self):return self.weightdef set_weight(self, weight):if weight.shape != self.weight.shape:raise ValueError(f"Weight shape mismatch: expected {self.weight.shape}, got {weight.shape}")self.weight = weightdef __call__(self, x, *args, **kwargs):if self.padding > 0:x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant') # 在四周填充0batch_size, in_channel, height, width = x.shapekernel_size = self.kernel_size# 计算输出的高度和宽度out_height = (height - kernel_size) // self.stride + 1out_width = (width - kernel_size) // self.stride + 1output = np.zeros((batch_size, self.out_channel, out_height, out_width))for channel in range(self.out_channel):# 取出当前输出通道的权重kernel = self.weight[channel, :, :, :]# 添加biasif self.bias is not None:output[:, channel, :, :] += self.bias[channel]else:output[:, channel, :, :] = 0for i, end_height in enumerate(range(kernel_size - 1, height, self.stride)):for j, end_width in enumerate(range(kernel_size - 1, width, self.stride)):# 取出图像的滑动窗口start_height = end_height - kernel_size + 1start_width = end_width - kernel_size + 1window = x[:, :, start_height:end_height + 1, start_width:end_width + 1]# 计算卷积result = np.sum(kernel * window, axis=(1, 2, 3))output[:, channel, i, j] += resultreturn outputif __name__ == '__main__':# 测试代码x = np.random.randn(2, 3, 5, 5) # batch_size=2, in_channel=3, height=5, width=5conv_layer = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=1)output = conv_layer(x)print("Output shape:", output.shape)conv_layer.print_weight()
The Problem
The catch with the sliding-window approach is that it is very slow: there are three nested loops, and doing heavy numerical work with Python-level loops is thankless. I'll show the actual time cost in a chart further down.
The Matrix-Multiplication Approach
How It Works
https://zhuanlan.zhihu.com/p/360859627
https://gist.github.com/hsm207/7bfbe524bfd9b60d1a9e209759064180
https://blog.csdn.net/caip12999203000/article/details/126494740
I won't repeat the full derivation; the three links above explain it well if you read them carefully. The essential idea is to replace the many per-window multiply-and-sum operations of the sliding window with a single matrix multiplication (im2col), which gives a substantial speedup. The downside is that it materializes a fairly large intermediate matrix, so some memory overhead is unavoidable; in essence, it trades space for time.
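To make the idea concrete, here is a tiny demonstration (my own sketch; it uses torch.nn.functional.unfold for brevity instead of the hand-written unfold in the code below) showing that one matrix multiplication over the unfolded input reproduces conv2d:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 5, 5)
w = torch.randn(2, 3, 3, 3)

cols = F.unfold(x, kernel_size=3)           # (1, 27, 9): each column is one flattened 3x3x3 window
out = w.reshape(2, -1) @ cols               # (1, 2, 9): a single matmul replaces all the window sums
out = out.reshape(1, 2, 3, 3)

print(torch.allclose(out, F.conv2d(x, w), atol=1e-5))  # True (up to float32 rounding)
```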
A concrete example of the cost is shown in the figure: a layer built as
mmConv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=64, padding=1), applied to an input
x = np.random.randn(64, 3, 224, 224).astype(np.float32), already eats 1.5 GB+ of memory.
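A rough back-of-the-envelope estimate (my own, assuming float32 throughout; the class below actually keeps its weights in float64, so the real footprint is even larger) of where that memory goes:

```python
# sizes of the two main intermediates for x = (64, 3, 224, 224), 3x3 kernel, padding=1, stride=1
batch, c_in, c_out, k = 64, 3, 64, 3
num_windows = 224 * 224                                    # one window per output pixel
bytes_f32 = 4

im2col = batch * num_windows * (c_in * k * k) * bytes_f32  # the unfolded input matrix
matmul = batch * num_windows * c_out * bytes_f32           # the matmul result before reshape

print(f"im2col matrix: {im2col / 1024**3:.2f} GiB")        # ~0.32 GiB (plus a temporary copy from list -> array)
print(f"matmul output: {matmul / 1024**3:.2f} GiB")        # ~0.77 GiB
```

Add the temporary copies created by np.array and the transposes, and the 1.5 GB figure is easy to reach.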
Code
```python
class ManualMatMulConv():
    """
    Manually implemented convolution using matrix multiplication (im2col).
    No backward pass is implemented.
    """

    def __init__(self, kernel_size, in_channel, out_channel, stride=1, padding=0, bias=True):
        self.kernel_size = kernel_size
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.stride = stride
        self.padding = padding
        self.bias = bias
        self.weight = np.random.randn(out_channel, in_channel, kernel_size, kernel_size)
        if bias:
            self.bias = np.random.randn(out_channel)
        else:
            self.bias = None

    def print_weight(self):
        print("Weight shape:", self.weight.shape)
        print("Weight values:\n", self.weight)

    def get_weight(self):
        return self.weight

    def set_weight(self, weight):
        if weight.shape != self.weight.shape:
            raise ValueError(f"Weight shape mismatch: expected {self.weight.shape}, got {weight.shape}")
        self.weight = weight

    def __call__(self, x, *args, **kwargs):
        if self.padding > 0:
            # zero-pad the input on all four sides
            x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        batch_size, in_channel, height, width = x.shape
        kernel_size = self.kernel_size
        # compute the output height and width
        out_height = (height - kernel_size) // self.stride + 1
        out_width = (width - kernel_size) // self.stride + 1
        # flatten the weights into a matrix
        weight_matrix = self.weight.reshape(self.out_channel, -1)  # shape (out_channel, in_channel * kernel_size * kernel_size)
        # unfold the input into a matrix (hand-written im2col / unfold)
        unfolded_x = []
        for i in range(0, height - kernel_size + 1, self.stride):
            for j in range(0, width - kernel_size + 1, self.stride):
                # take the sliding window and flatten it into a row
                window = x[:, :, i:i + kernel_size, j:j + kernel_size].reshape(batch_size, -1)
                unfolded_x.append(window)
        unfolded_x = np.array(unfolded_x)  # shape: (num_windows, batch_size, in_channel * kernel_size * kernel_size)
        unfolded_x = np.transpose(unfolded_x, (1, 0, 2))  # shape: (batch_size, num_windows, in_channel * kernel_size * kernel_size)
        # compute the convolution as one matrix multiplication
        output = np.matmul(unfolded_x, weight_matrix.T)  # shape (batch_size, num_windows, out_channel)
        output = np.transpose(output, (0, 2, 1))  # shape (batch_size, out_channel, num_windows)
        output = output.reshape(batch_size, self.out_channel, out_height, out_width)
        # add the bias
        if self.bias is not None:
            output += self.bias.reshape(1, -1, 1, 1)
        return output
```
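An optional refinement I'd suggest (my own addition, not part of the original class): NumPy's sliding_window_view (NumPy ≥ 1.20) can build the same im2col matrix without the Python double loop, which also avoids the extra copy implied by appending to a list.

```python
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view


def unfold_numpy(x, k, stride=1):
    """x: (B, C, H, W) -> (B, num_windows, C*k*k), matching the hand-written unfold above."""
    windows = sliding_window_view(x, (k, k), axis=(2, 3))   # (B, C, H-k+1, W-k+1, k, k), a view, no copy
    windows = windows[:, :, ::stride, ::stride]             # apply the stride
    B, C, oh, ow, _, _ = windows.shape
    return windows.transpose(0, 2, 3, 1, 4, 5).reshape(B, oh * ow, C * k * k)
```

Dropping this in place of the loop in __call__ should produce the same unfolded_x (the window ordering and flattening order match), just without the Python-level iteration.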
Result
Verification code
```python
if __name__ == '__main__':
    # test code: share the same weights between the two implementations and compare outputs
    conv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=0, bias=False)
    slide_window_conv = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=2, stride=1, padding=0, bias=False)
    conv.set_weight(slide_window_conv.get_weight())

    x = np.random.randn(1, 3, 5, 5)  # input shape (batch_size, in_channel, height, width)
    output = conv(x)
    slide_window_output = slide_window_conv(x)
    print("Output shape:", output.shape)
    print("slide_window_output shape:", slide_window_output.shape)
    assert np.allclose(conv.get_weight(), slide_window_conv.get_weight()), "Weights do not match!"
    print("output:")
    print(output)
    print("slide_window_output:")
    print(slide_window_output)
    # check that the two implementations produce the same result
    assert np.allclose(output, slide_window_output), "Outputs do not match!"
    print("Outputs match!")
```
Performance Comparison
Four convolution implementations are compared here:
- sliding-window convolution
- matrix-multiplication convolution
- torch.nn.Conv2d
- torch.nn.Conv2d on CUDA
Comparison Code
```python
import numpy as np
from matplotlib import pyplot as plt
from manual.conv.slide_window import ManualSlideWindowConv
from manual.conv.matmul import ManualMatMulConv
import torch
import time

# compare convolution speed across different batch sizes
speeds = {
    'manual_matmul': [],
    'manual_slide_window': [],
    'torch': [],
    'torch_cuda': []
}

swConv = ManualSlideWindowConv(kernel_size=3, in_channel=3, out_channel=64, padding=1)
mmConv = ManualMatMulConv(kernel_size=3, in_channel=3, out_channel=64, padding=1)
torchConv = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
torchCudaConv = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1).cuda()


def timing_conv(conv, x):
    start = time.time()
    y = conv(x)
    end = time.time()
    return y, end - start


for bs in [1, 2, 4, 8, 16, 32]:
    x = np.random.randn(bs, 3, 224, 224).astype(np.float32)
    x_torch = torch.from_numpy(x)
    x_torch_cuda = x_torch.cuda()

    y, speed = timing_conv(swConv, x)
    speeds['manual_slide_window'].append(speed)
    print(f'slide_window bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(mmConv, x)
    speeds['manual_matmul'].append(speed)
    print(f'matmul bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(torchConv, x_torch)
    speeds['torch'].append(speed)
    print(f'torch bs={bs}, speed={speed:.4f}s')

    y, speed = timing_conv(torchCudaConv, x_torch_cuda)
    speeds['torch_cuda'].append(speed)
    print(f'torch_cuda bs={bs}, speed={speed:.4f}s')

    print('-' * 50)
```
Log
slide_window bs=1, speed=39.8342s
matmul bs=1, speed=0.1436s
torch bs=1, speed=0.0080s
torch_cuda bs=1, speed=0.0000s
--------------------------------------------------
slide_window bs=2, speed=39.8841s
matmul bs=2, speed=0.2185s
torch bs=2, speed=0.0172s
torch_cuda bs=2, speed=0.0010s
--------------------------------------------------
slide_window bs=4, speed=44.0416s
matmul bs=4, speed=0.3975s
torch bs=4, speed=0.0329s
torch_cuda bs=4, speed=0.0000s
--------------------------------------------------
slide_window bs=8, speed=41.7520s
matmul bs=8, speed=0.3222s
torch bs=8, speed=0.0588s
torch_cuda bs=8, speed=0.0000s
--------------------------------------------------
slide_window bs=16, speed=45.5278s
matmul bs=16, speed=0.5858s
torch bs=16, speed=0.1067s
torch_cuda bs=16, speed=0.0010s
--------------------------------------------------
slide_window bs=32, speed=58.1965s
matmul bs=32, speed=1.2161s
torch bs=32, speed=0.2045s
torch_cuda bs=32, speed=0.0010s
--------------------------------------------------
Result
Chart of the results with the slowest method (the sliding window) removed:
As the chart shows, the matrix-multiplication approach is reasonably fast, or at least far faster than the sliding window.
Some Thoughts
But this raises another question: why does the matrix-multiplication approach cost so much memory, while torch.nn.Conv2d doesn't seem to have the same problem?
After reading through some references, here is my brief summary:
- Tiled (tiling) or blocked (blocking) computation
  Although matrix-multiplication convolution conceptually unfolds the entire input against the kernels, real hardware implementations (e.g. on GPUs) do not necessarily process all of the data at once; they can break the computation into smaller "tiles" or "blocks".
  Local matrix multiplication: instead of unfolding the whole image in one go, each step performs the im2col transform and the matrix multiplication on only a small part of the input (for example one batch element, or a small region of one output channel). This bounds the size of the intermediate matrix and keeps the peak memory usage low; once a block is computed, its result is stitched back into the final output feature map. (A toy sketch of this idea appears at the end of this section.)
  Data reuse: the blocking strategy also makes better use of the CPU cache or GPU memory hierarchy, because the same small block of data can be reused repeatedly before it is fully processed, reducing traffic between main memory and the cache.
- Smart algorithm selection
  The backend dynamically picks the most suitable low-level algorithm for the given convolution parameters (e.g. Winograd, FFT, or an optimized direct convolution) instead of relying on im2col alone.
This is also why our hand-written implementation falls short of the official version.
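To make the tiling idea a bit more concrete, here is a toy sketch of my own (purely illustrative, not how cuDNN or PyTorch actually implements convolution): the im2col buffer only ever holds a few output rows at a time, so its size stays bounded no matter how large the image is.

```python
import numpy as np


def conv2d_matmul_tiled(x, weight, stride=1, rows_per_tile=8):
    """x: (B, C_in, H, W), weight: (C_out, C_in, k, k); no padding, for brevity."""
    B, C_in, H, W = x.shape
    C_out, _, k, _ = weight.shape
    out_h = (H - k) // stride + 1
    out_w = (W - k) // stride + 1
    w_mat = weight.reshape(C_out, -1)                        # (C_out, C_in*k*k)
    out = np.zeros((B, C_out, out_h, out_w), dtype=x.dtype)

    for row_start in range(0, out_h, rows_per_tile):         # handle a block of output rows at a time
        row_end = min(row_start + rows_per_tile, out_h)
        cols = []
        for oi in range(row_start, row_end):
            for oj in range(out_w):
                hi, wj = oi * stride, oj * stride
                cols.append(x[:, :, hi:hi + k, wj:wj + k].reshape(B, -1))
        tile = np.stack(cols, axis=1)                         # (B, tile_windows, C_in*k*k) -- stays small
        res = tile @ w_mat.T                                  # (B, tile_windows, C_out)
        res = res.transpose(0, 2, 1).reshape(B, C_out, row_end - row_start, out_w)
        out[:, :, row_start:row_end, :] = res                 # stitch the tile into the output
    return out
```

The result is identical to the un-tiled version; only the peak size of the intermediate matrix changes.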