cuda编程笔记（18）-- 使用im2col + GEMM 实现卷积

我们之前介绍了cudnn调用api直接实现卷积，本文我们探究手动实现。

对于直接使用for循环在cpu上的实现方法，就不过多介绍，只要了解卷积的原理，就很容易实现。

im2col 的核心思想

im2col = image to column

把输入 feature map 的每个卷积感受野（sliding window）展开成一列向量
卷积核也展开成一个行向量
然后把卷积转化成 矩阵乘法（GEMM, General Matrix Multiply）

举例

假设：

输入：1 channel, 4×4
卷积核：1×2×2
步长 stride=1

输入：

1  2  3  4
5  6  7  8
9  10 11 12
13 14 15 16

卷积核 2×2 的每个 sliding window：

[[1,2],[5,6]] → 展开为 [1,2,5,6]
[[2,3],[6,7]] → 展开为 [2,3,6,7]
...

用 im2col 后：

X_col = [[1, 2, 5, 6],   # 第一个位置[2, 3, 6, 7],   # 第二个位置[3, 4, 7, 8],...]

卷积核也展开成：

W_col = [w1, w2, w3, w4]

然后 卷积计算就变成矩阵乘法：

$Y=W_{col}\cdot X_{col}$

输出每个位置就是矩阵乘法的一个元素
对多通道、多卷积核也可以批量做

说人话，就是把卷积核每次对准的这块矩阵区域展平成向量；比如X_col的第一行，与W_col作向量乘法，结果就是第一次卷积得到的结果。

但是一般我们会将W_col的参数作为一行，所以X_col实际存储需要转置一下，这样才符合矩阵乘法的要求

为什么效率高？

GEMM 有高度优化
- BLAS/cuBLAS/cuDNN 都对矩阵乘法做了很多优化
- 可以充分利用 SIMD / GPU 并行
循环嵌套少
- 原始卷积是 6 层循环（batch, output channel, height, width, input channel, kernel height/width）
- im2col + GEMM → 只要一次矩阵乘法
容易扩展到多通道、多 batch

代码实现

代码中对应的X_col就是按列存储展开的元素

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <iostream>
#include<cstdio>
#include <cmath>
#include <cstdlib>
#include <vector>__global__ void im2col_kernel(const float *data_im,//输入图片数据 [C,H,W]int channels,int height,int width,// 输入通道数 C,输入高度 H, 宽度 Wint ksize,int pad,int stride,// 卷积核大小 (假设方形: kH=kW=ksize),padding 大小,卷积步长int height_col,int width_col,// 输出特征图的高宽float *data_col){ // im2col 展开的矩阵输出 [C*ksize*ksize, H_out*W_out]int index=blockIdx.x*blockDim.x+threadIdx.x;//index: 每个线程负责展开一个卷积窗口中的一个元素。int n=channels*ksize*ksize*height_col*width_col;//n: 总的元素数if(index<n){//对比一下w_out*h_out*k_idx*k_w*k_h与n就知道为什么要这么计算这五个变量了//w_out, h_out: 代表当前处理的输出特征图位置 (卷积结果的坐标)。int w_out=index%width_col;int h_out=(index/width_col)%height_col;//k_idx → (c_in, k_h, k_w): 把卷积核的 index 分解成通道号、核内行列。int k_idx=(index/width_col/height_col);int k_w=k_idx%ksize;int k_h=(k_idx/ksize)%ksize;//c_in输入通道索引int c_in=k_idx/(ksize*ksize);//im_row, im_col: 对应输入图片上的真实位置（考虑 stride、padding 后）//这是经典的把输出坐标映射回输入坐标的公式int im_row=h_out*stride-pad+k_h;int im_col=w_out*stride-pad+k_w;//col_index: data_col 的存储索引int col_index=(c_in*ksize*ksize+k_h*ksize+k_w)* (height_col * width_col) + h_out * width_col + w_out;float val=0;//如果 im_row, im_col 在输入图像范围内 → 取值；否则属于padding → 0if(im_row>=0&&im_row<height&&im_col>=0&&im_col<width){val=data_im[(c_in*height+im_row)*width+im_col];}data_col[col_index]=val;}
}
void im2col_gpu(const float *d_im,int channels,int height,int width,int ksize,int pad,int stride,float *d_col){//计算输出大小：就是卷积输出 H_out, W_out 的公式。int height_col=(height +2*pad-ksize)/stride+1;int width_col=(width+2*pad-ksize)/stride+1;int n=channels*ksize*ksize*height_col*width_col;int threads=256;int blocks=(n+threads-1)/threads;im2col_kernel<<<blocks,threads>>>(d_im,channels,height,width,ksize,pad,stride,height_col,width_col,d_col);cudaDeviceSynchronize();
}
void conv_forward_im2col(const float *d_input,// 输入图像 [C,H,W]const float *d_weight,// 卷积核 [K,C,ksize,ksize]float *d_output,//输出 [K,H_out,W_out]int C,int H,int W,// 输入通道数，高，宽int K,// 卷积核数量 (输出通道数)int ksize,int stride,int pad,// 核大小，步长，填充cublasHandle_t &handle){int H_out=(H+2*pad-ksize)/stride+1;int W_out=(W+2*pad-ksize)/stride+1;float *d_col;// im2col buffer: [C*ksize*ksize, H_out*W_out]cudaMalloc(&d_col,sizeof(float)*C*ksize*ksize*H_out*W_out);im2col_gpu(d_input,C,H,W,ksize,pad,stride,d_col);//原本是W*x的顺序，但是由于cublasSgemm函数的特性，写的时候参照实际传递的方式// GEMM: d_weight:[K, C*ksize*ksize] * d_col:[C*ksize*ksize, H_out*W_out] = [K, H_out*W_out]const float alpha=1.0f,beta=0.0f;cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,H_out*W_out,K,C*ksize*ksize,&alpha,d_col,H_out*W_out,d_weight,C*ksize*ksize,&beta,d_output,H_out*W_out);cudaFree(d_col);
}
int main() {int C=1, H=5, W=5;int K=1, ksize=3, stride=1, pad=1;std::vector<float> h_input(C*H*W, 1.0f);   // 全1输入std::vector<float> h_weight(K*C*ksize*ksize, 1.0f); // 全1卷积核std::vector<float> h_output(K*H*W);float *d_input, *d_weight, *d_output;cudaMalloc(&d_input, h_input.size()*sizeof(float));cudaMalloc(&d_weight, h_weight.size()*sizeof(float));cudaMalloc(&d_output, h_output.size()*sizeof(float));cudaMemcpy(d_input, h_input.data(), h_input.size()*sizeof(float), cudaMemcpyHostToDevice);cudaMemcpy(d_weight, h_weight.data(), h_weight.size()*sizeof(float), cudaMemcpyHostToDevice);cublasHandle_t handle;cublasCreate(&handle);conv_forward_im2col(d_input, d_weight, d_output,C,H,W,K,ksize,stride,pad, handle);cudaMemcpy(h_output.data(), d_output, h_output.size()*sizeof(float), cudaMemcpyDeviceToHost);cublasDestroy(handle);// 打印结果int H_out=(H+2*pad-ksize)/stride+1;int W_out=(W+2*pad-ksize)/stride+1;std::cout << "Output (" << K << "," << H_out << "," << W_out << "):\n";for(int i=0;i<K;i++){for(int h=0;h<H_out;h++){for(int w=0;w<W_out;w++){std::cout << h_output[i*H_out*W_out+h*W_out+w] << " ";}std::cout << "\n";}}cudaFree(d_input);cudaFree(d_weight);cudaFree(d_output);return 0;
}

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。
如若转载，请注明出处：http://www.pswp.cn/diannao/98013.shtml
繁体地址，请注明出处：http://hk.pswp.cn/diannao/98013.shtml

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！