http://blog.csdn.net/xizero00/article/details/51049858

一、卷积层的作用简介

卷积层是深度神经网络中的一个重要的层，该层实现了局部感受野，通过这种局部感受野，可以有效地降低参数的数目。

我们将结合caffe来讲解具体是如何实现卷积层的前传和反传的。至于是如何前传和反传的原理可以参考Notes on Convolutional Neural Networks，具体请百度或者谷歌，即可下载到。

Caffe中的master分支已经将vision_layers.hpp中的各个层分散到layers中去了，因此如果你是主分支的代码，请在include/layers中找BaseConvolutionLayer和ConvolutionLayer的头文件的定义。

二、卷积层的详细介绍

1）构造函数

[cpp] view plaincopy  
 // 构造函数  
   explicit BaseConvolutionLayer(const LayerParameter& param)  
       : Layer<Dtype>(param) {}  

2）成员变量

[cpp] view plaincopy  
 /// @brief The spatial dimensions of a filter kernel.  
   // kernel的形状 = [kernel_h, kernel_w]  
   Blob<int> kernel_shape_;  
   
   /// @brief The spatial dimensions of the stride.  
   // 步长形状 = [stride_h, stride_w]  
   Blob<int> stride_;  
   
   /// @brief The spatial dimensions of the padding.  
   // pad的形状 = [pad_h, pad_w]  
   Blob<int> pad_;  
   
   /// @brief The spatial dimensions of the convolution input.  
   // 卷积的输入形状 = [输入图像通道数, 输入图像h,    输入图像w]  
   Blob<int> conv_input_shape_;  
   
   /// @brief The spatial dimensions of the col_buffer.  
   // col_buffer的形状 = [kernel_dim_, conv_out_spatial_dim_ ]  
   vector<int> col_buffer_shape_;  
   
   /// @brief The spatial dimensions of the output.  
   // 输出的形状  
   vector<int> output_shape_;  
   
   // 输入的形状  
   const vector<int>* bottom_shape_;  
   
   // 空间轴个数  
   int num_spatial_axes_;  
   
   // 输入度维度 = 输入图像通道数*输入图像的h*输入图像w  
   int bottom_dim_;  
   
   // 输出维度 = 输出通道数*输出h*输出w  
   int top_dim_;  
   
   // 输入图像的第几个轴是通道  
   int channel_axis_;  
   
   // batchsize  
   int num_;  
   
   // 输入图像的通道数  
   int channels_;  
   
   // 卷积组的大小  
   int group_;  
   
   // 输出空间维度 = 卷积之后的图像长*卷积之后图像的宽  
   int out_spatial_dim_;  
   
   // 使用卷积组用到的  
   int weight_offset_;  
   
   // 卷积后的图像的通道数  
   int num_output_;  
   
   // 是否启用偏置  
   bool bias_term_;  
   
   // 是不是1x1卷积  
   bool is_1x1_;  
   
   // 强制使用n维通用卷积  
   bool force_nd_im2col_;  
   
   // conv_in_channels_ * conv_out_spatial_dim_  
   int num_kernels_im2col_;  
   // num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_  
   int num_kernels_col2im_;  
   
   // 卷积的输出通道数 ,在参数配置文件中设置  
   int conv_out_channels_;  
   
   // 卷积的输入通道数 （即输入图像的通道数）  
   int conv_in_channels_;  
   
   // 卷积的输出的空间维度 = 卷积后图像h*卷积后图像w  
   int conv_out_spatial_dim_;  
   
   // 卷积核的维度 = 输入图像的维度*卷积核的h*卷积核的w  
   int kernel_dim_;  
   
   // 在使用gropu参数的时候使用的offset  
   int col_offset_;  
   int output_offset_;  
     
   // im2col的时候使用的存储空间  
   Blob<Dtype> col_buffer_;  
   
   // 将偏置扩展成矩阵的东东  
   Blob<Dtype> bias_multiplier_;  

为了更为准确地解释，我把代码中的各个变量的含义也贴进来：

[cpp] view plaincopy  
 channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_aram.axis());  
   num_spatial_axes_ = num_axes - first_spatial_axis;  
   // 是否需要强制n维卷积  
   force_nd_im2col_ = conv_param.force_nd_im2col();  
   
   // 如果是正方形的那么  
   kernel_shape_data[0]和[1]=conv_param.kernel_size(0)  
   stride_data[0]和[1] = conv_param.stride(0)  
   pad_data[0]和[1] = conv_param.pad[0]  
   
   // 输入的图像的通道数  
   conv_in_channels_ = channels_;  
    
   // 经过卷积之后的通道数   
   conv_out_channels_ = num_output_;  
   bias_term_ = 1或者0  
     
   // 一个kernel大小的图像块的维度是，输入图像的通道数乘以kernel的长度和宽度  
   kernel_dim_ = input channels per-group x kernel height x kernel width  
   
   // 卷积进行分组的offset  
   weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;  
   
   // 批数  
   num_ = batchsize  
   
   // 卷积层的输入的图像的形状,  batchsize x input image channel x input image height x input image width  
   bottom_shape_ = &bottom[0]->shape();  
   
     
   conv_out_spatial_dim_ = 输出图像的长度和宽度  
   
   // 就是一个kernel的图像块中像素的个数  
   kernel_dim_ = input channels per-group x kernel height x kernel width    
   
   
   // col_buffe_shape压入的是经过im2col处理之后的形状  
   col_buf_shape = kernel_dim_ x   

输入的大小 bottom_dim_ = conv_in_channel X in_height X in_width

输出的大小 top_dim_ = conv_out_channel X out_height X out_width

卷积核输入的图像大小num_kernels_im2col_ = conv_in_channel X out_height X out_width

卷积核输出的图像大小num_kernels_col2im_ = conv_out_channel X out_height X out_width

将偏置扩展成矩阵的bias_multiplier_

weight_shape = [conv_out_channels_, conv_in_channels_/group, weight_h, weight_w]

conv_input_shape_data = [inchannel, in_height, in width]

kernel_shape_=[kernel_height, kernel_width]

3）成员函数

在给出上述的变量以后，不得不介绍一下caffe里面究竟是如何实现卷积的，在介绍成员函数这一小节，我们从调用的顺序讲起是如何实现卷积以及反传的。

ConvolutionLayer是继承于BaseConvolutionLayer的，而BaseConvolutionLayer才是真正实现卷积及其反传的，而在BaseConvolutionLayer中的卷积的实现中有一个重要的函数就是im2col以及col2im，im2colnd以及col2imnd。前面的两个函数是二维卷积的正向和逆向过程，而后面的两个函数是n维卷积的正向和逆向过程。

首先放出conv_layer.cpp的前传代码

[cpp] view plaincopy  
 template <typename Dtype>  
 void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,  
       const vector<Blob<Dtype>*>& top) {  
   const Dtype* weight = this->blobs_[0]->cpu_data();  
   for (int i = 0; i < bottom.size(); ++i) {  
     // bottom_data指向数组，此数组以一张图像为单位
     const Dtype* bottom_data = bottom[i]->cpu_data();  
     Dtype* top_data = top[i]->mutable_cpu_data();  
     // num_ = batchsize  
     for (int n = 0; n < this->num_; ++n) {  
       // 基类的forward_cpu_gemm函数  
       // 计算的是top_data[n * this->top_dim_] =  
       // weights X bottom_data[n * this->bottom_dim_]  
       // 输入的是(一个batch中)一幅图像的数据，对应的是这幅图像卷积之后的位置  
       this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,  
           top_data + n * this->top_dim_);  
       if (this->bias_term_) {  
         const Dtype* bias = this->blobs_[1]->cpu_data();  
         this->forward_cpu_bias(top_data + n * this->top_dim_, bias);  
       }  
     }  
   }  
 }  

bottom_data与top_data中存放着输入和输出数据，blobs_[0]存放着权重数据，blobs_[1]存放着偏置信息。

上述的代码中调用了基类的前传函数，那么我们就把基类的前传函数拿出来看看

[cpp] view plaincopy  
 template <typename Dtype>  
 void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,  
     const Dtype* weights, Dtype* output, bool skip_im2col) {  
   const Dtype* col_buff = input;  
   if (!is_1x1_) {  
     if (!skip_im2col) {  
       // 如果没有1x1卷积，也没有skip_im2col  
       // 则使用conv_im2col_cpu对使用卷积核滑动过程中的每一个kernel大小的图像块  
       // 变成一个列向量，形成一个height=kernel_dim_的  
       // width = 卷积后图像heght*卷积后图像width  
       conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());  
     }  
     col_buff = col_buffer_.cpu_data();  
   }  
   
   // 使用caffe的cpu_gemm来进行计算  
   for (int g = 0; g < group_; ++g) {  
       // 分组分别进行计算  
       // conv_out_channels_ / group_是每个卷积组的输出的channel  
       // kernel_dim_ = input channels per-group x kernel height x kernel width  
       // 计算的是output[output_offset_ * g] =  
       // weights[weight_offset_ * g] X col_buff[col_offset_ * g]  
       // weights的形状是 [conv_out_channel x kernel_dim_]  
       // col_buff的形状是[kernel_dim_ x (卷积后图像高度乘以卷积后图像宽度)]  
       // 所以output的形状自然就是conv_out_channel X (卷积后图像高度乘以卷积后图像宽度)  
     caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /  
         group_, conv_out_spatial_dim_, kernel_dim_,  
         (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,  
         (Dtype)0., output + output_offset_ * g);  
   }  
 }  
   
 template <typename Dtype>  
 void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,  
     const Dtype* bias) {  
   // output = bias * bias_multiplier_  
   // num_output 与 conv_out_channel是一样的  
   // num_output_ X out_spatial_dim_ = num_output_ X 1    1 X out_spatial_dim_  
   caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,  
       out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),  
       (Dtype)1., output);  
 }  

而积累中又使用了conv_im2col_cpu将卷积核在图像上的滑动转换为了矩阵。

这就是真身：

[cpp] view plaincopy  
 inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {  
     if (!force_nd_im2col_ && num_spatial_axes_ == 2) {  
       im2col_cpu(data, conv_in_channels_,  
           conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],  
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],  
           pad_.cpu_data()[0], pad_.cpu_data()[1],  
           stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);  
     } else {  
       im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),  
           col_buffer_shape_.data(), kernel_shape_.cpu_data(),  
           pad_.cpu_data(), stride_.cpu_data(), col_buff);  
     }  
   }  

调用im2col_cpu的时候输入的参数为

im2col_cpu(一幅图像，输入图像的channel, 输入图像的height, 输入图像的width, kernel的height, kernel的width, pad的height, pad的width, stride的height， stride的width)

其函数原型如下

[cpp] view plaincopy  
 // 将输入的图像首先进行虚假pad(啥叫虚假填充，就是实际没填充，但是目标图像中有了填充的0)  
 // 填充这一步，我们在原图像上并没有做pad,只是在处理后的图像上加上了pad的值  
 // 然后按照channel*kernel_h*kernel_w一列，将一个channel x kernel_h x kernel_w 大小的图像块变成一个列。  
 // 有多少个这样的列呢，这就可以用公式进行计算  
 // 列数 = [(图像高度+2*填充高度-kernel高度)/stride高度+1] * [(图像宽度+2*填充宽度-kernel宽度)/stride宽度+1]  
 // 这个行数就是一个kernel大小的图像块的维度  
 // 这个列数实际上就是kernel在图像上滑动的次数  
 template <typename Dtype>  
 void im2col_cpu(const Dtype* data_im, const int channels,  
     const int height, const int width, const int kernel_h, const int kernel_w,  
     const int pad_h, const int pad_w,  
     const int stride_h, const int stride_w,  
     Dtype* data_col) {  
   int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;  
   int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;  
   int channels_col = channels * kernel_h * kernel_w;  
     
   // 遍历一个kernel大小的图像  
   for (int c = 0; c < channels_col; ++c) {  
     // 下面三行是计算在kernel大小的图像上面的位置  
     //  c_im     h_offset     w_offset      
       
     // 在当前的kernel大小的图像上的w  
     int w_offset = c % kernel_w;  
     // 在当前的kernel大小的图像上的h  
     int h_offset = (c / kernel_w) % kernel_h;  
    // 当前kernel大小的图像是第几个channel  
     int c_im = c / kernel_h / kernel_w;  
   
     // 遍历卷积之后的图像的上面的每一个像素  
     for (int h = 0; h < height_col; ++h) {  
       for (int w = 0; w < width_col; ++w) {  
          
         // 计算卷积之后的图像与卷积之前的图像的位置  
    
         // 卷积之后的图像与卷积之前的图像像素所对应的位置  
         // 卷积之后的像素为h和w那么所对应的原图像的位置为 [h * stride_h - pad_h,   h * stride_h - pad_h+kernel_h]以及  
         // [w * stride_w - pad_w,   w * stride_w - pad_w+kernel_w]  
         int h_pad = h * stride_h - pad_h + h_offset;  
         int w_pad = w * stride_w - pad_w + w_offset;  
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)  
           data_col[(c * height_col + h) * width_col + w] =  
             data_im[(c_im * height + h_pad) * width + w_pad];  
         else  
           data_col[(c * height_col + h) * width_col + w] = 0;  
       }  
     }  
   }  
 }  

看起来这段代码很难理解，其实你不妨把计算卷积之后的图像与卷积之前的图像的位置看成是一个函数find_src_location

该函数输入的是h_offset，w_offset

该函数主要就是将每一个子矩形（与kernel一样大小的图像）的(h_offset，w_offset )的位置的像素取出来放到第c行去

[cpp] view plaincopy  
 void find_src_location(h_offset, w_offset)  
 {  
      // 以下是遍历im2col处理之后的图像的一行的像素的位置  
     // 因为卷积之后的图像的列数就是height_col*width_col  
     // 这里两个for循环其实只是处理之后的矩阵的一行  
     for (int h = 0; h < height_col; ++h) {  
       for (int w = 0; w < width_col; ++w) {  
   
           // 将处理后图像的每一个像素找到位于输入图像中的像素值  
         int h_pad = h * stride_h - pad_h + h_offset;  
         int w_pad = w * stride_w - pad_w + w_offset;  
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)  
           data_col[(c * height_col + h) * width_col + w] =  
             data_im[(c_im * height + h_pad) * width + w_pad];  
         else  
           data_col[(c * height_col + h) * width_col + w] = 0;  
       }  
     }  
 }  

这个函数具体是做了啥事了呢，我来画个图

如下图所示，左边的是输入的原始图像，该图像需要进行卷积，右图是经过im2col处理的得到的图像

结合图来说这个函数find_src_location内部的两个for循环干的事情就是将每个不同颜色的小矩形框内部(h_offset，w_offset )位置的像素值复制到

第c行的位置去。比如下图中的右图，第一行的1，就是左图中的每一个不同颜色的小矩形框内的(0,0)位置的1全部复制过来的。

图1 解释im2col的工作过程

上图中输入图像的channel=1,长im_w是9,宽im_h是9,

kernel的h和w都是3,因为kernel是在图像上面滑动，所以kernel_channel=1

假设padding的h和w都是0，也就是不填充

stride_h为3,stide_w=3

那么通过该函数，就可以获得新的矩形

长是：channel*kernel_h*kernel_w = 1*3*3=9

宽是：[(im_h+2*pad_h-kernel_h)/stride_h+1] * [(im_w+2*pad_w-kernel_w)/stride_w+1]=[(9+2*0-3)/3+1]*[(9+2*0-3)/3+1]=9

注意：这个宽呢，就是经过卷积之后的图像的长和宽相乘

含义是什么：

将一个kernel大小的图像块变成图中新的矩形的一个列，一列的长是channel*kernel_h*kernel_w = 1*3*3=9

而宽（列数）则是在图像上的滑动的次数,这里是9，之所以是9并不是巧合，而是取了stride_h和stride_w都为3了。

可是这么干是为了什么呢？

这可以将滑动的卷积变成两个矩形相乘。

怎么讲，假设一个kernel输出的channel=output channel

那么对应的卷积权重W矩阵的形状即为outputchannle x [channel*kernel_h*kernel_w]

而图像经过im2col处理之后的原始图像srcimg变成了[channel*kernel_h*kernel_w] x [卷积之后的图像的长和宽相乘 ]

W x srcimg = outputchannle x 卷积之后的图像的长和宽相乘

怎么样，这下理解了吧

很机智地将卷积变成了两个矩阵相乘了。。。

而对应的col2im的代码就很类似了与im2col 的代码几乎没有啥差别就是这个下面的赋值语句的位置颠倒了一下

[cpp] view plaincopy  
 template <typename Dtype>  
 void col2im_cpu(const Dtype* data_col, const int channels,  
     const int height, const int width, const int patch_h, const int patch_w,  
     const int pad_h, const int pad_w,  
     const int stride_h, const int stride_w,  
     Dtype* data_im) {  
   caffe_set(height * width * channels, Dtype(0), data_im);  
   int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;  
   int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;  
   int channels_col = channels * patch_h * patch_w;  
   for (int c = 0; c < channels_col; ++c) {  
     int w_offset = c % patch_w;  
     int h_offset = (c / patch_w) % patch_h;  
     int c_im = c / patch_h / patch_w;  
     for (int h = 0; h < height_col; ++h) {  
       for (int w = 0; w < width_col; ++w) {  
         int h_pad = h * stride_h - pad_h + h_offset;  
         int w_pad = w * stride_w - pad_w + w_offset;  
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)  
           data_im[(c_im * height + h_pad) * width + w_pad] +=  
               data_col[(c * height_col + h) * width_col + w];  
       }  
     }  
   }  
 }  

上面介绍了二维卷积，那么我们就乘热打铁，再看看n维通用卷积是如何实现的

接下来介绍n维通用的卷积的具体实现

n维卷积的实现与二维卷积的实现很类似，只不过对应的变量进行了变化，你只需要找到对应就可以很快理解

d_offset 对应于im2col中的h_offset和w_offset是一个输入图像的channel 乘以kernel_size大小的图像块的偏移量(kernel_size下面的代码有定义)

d_iter对应于im2col中内层for循环的h和w，是经过im2colnd处理过的col_buff中的偏移

d_pad对应于im2col中内层for循环的h_pad和w_pad，是输入的原始图像中的偏移

作者还将im2colnd和col2imnd合并到一起实现了，通过const bool im2col来判断是im2col还是col2im

[cpp] view plaincopy  
 // n维通用im2col以及col2im的实现  
 // 作者两个功能一起实现了  
 template <typename Dtype>  
 inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,  
     const int num_spatial_axes, const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     Dtype* data_output) {  
   
   // 如果不是im2col则表明是col2im,也就是说data_output是需要输出的原始图像大小的数据  
   if (!im2col) {  
     int im_size = im_shape[0];  
     for (int i = 0; i < num_spatial_axes; ++i) {  
       im_size *= im_shape[1 + i];  
     }  
     caffe_set(im_size, Dtype(0), data_output);  
   }  
   
   // 一个kernel大小的块有多大  
   int kernel_size = 1;  
   for (int i = 0; i < num_spatial_axes; ++i) {  
     kernel_size *= kernel_shape[i];  
   }  
   
   // channels_col = inputchannel(输入图像的channel)*kernel_size  
   const int channels_col = col_shape[0];  
   
   // 类似于im2col中的w_offset和h_offset，只不过因为这里是n维，所以用数组表示  
   vector<int> d_offset(num_spatial_axes, 0);  
   
   // 类似于im2col中w和h，是col_buff中的偏移  
   vector<int> d_iter(num_spatial_axes, 0);  
   
   
   for (int c = 0; c < channels_col; ++c) {  
   
     // Loop over spatial axes in reverse order to compute a per-axis offset.  
     // 计算n维kernel上的offset,与im2col中对应的代码一样的道理  
     // 只不过这里是n维了，所以用d_offset来表示  
     // 注意，这里用逆序来进行计算得到每个轴的偏移  
     int offset = c;  
     for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {  
       if (d_i < num_spatial_axes - 1) {  
         offset /= kernel_shape[d_i + 1];  
       }  
       d_offset[d_i] = offset % kernel_shape[d_i];  
     }  
   
   
     for (bool incremented = true; incremented; ) {  
       // Loop over spatial axes in forward order to compute the indices in the  
       // image and column, and whether the index lies in the padding.  
       // 是经过im2colnd变换之后的索引  
       int index_col = c;  
   
       // index_im是原始图像中的channel  
       // c = channel * kernel_size  
       int index_im = c / kernel_size;  
   
   
       bool is_padding = false;  
       for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {  
   
         // d是col_buff上的偏移，与d_pad相对(d_pad是原始图像上的偏移)  
         const int d = d_iter[d_i];  
   
         // 在d_pad是经过pad之后的col_buff中的坐标经过转换成原图中的坐标  
         const int d_pad = d * stride[d_i] - pad[d_i] + d_offset[d_i];  
   
         // 判断经过im2colnd处理的图像上的像素是否位于输入的n维图像的上的pad的那个部分  
         is_padding |= d_pad < 0 || d_pad >= im_shape[d_i + 1];  
   
         // 计算位于col_buff中的位置(就是经过im2colnd变换之后的)  
         index_col *= col_shape[d_i + 1];  
         index_col += d;  
   
         // 计算位于原始图像中的位置  
         index_im *= im_shape[d_i + 1];  
         index_im += d_pad;  
       }  
       if (im2col) {  
         if (is_padding) {// 如果是位于pad的部分则设置为0  
           data_output[index_col] = 0;  
         } else {  
           data_output[index_col] = data_input[index_im];  
         }  
       } else if (!is_padding) {  // col2im  
         data_output[index_im] += data_input[index_col];  
       }  
   
       // 更新位于col_buff上的偏移d(d_iter就是所有的d存进去的)  
       // Loop over spatial axes in reverse order to choose an index,  
       // like counting.  
       incremented = false;  
       for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {  
         const int d_max = col_shape[d_i + 1];  
         DCHECK_LT(d_iter[d_i], d_max);  
         if (d_iter[d_i] == d_max - 1) {  
           d_iter[d_i] = 0;  
         } else {  // d_iter[d_i] < d_max - 1  
           ++d_iter[d_i];  
           incremented = true;  
           break;  
         }  
       }  
     }  // while(incremented) {  
   }  // for (int c = 0; c < channels_col; ++c) {  
 }  

给出包裹im2col_nd_core_cpu 的im2col_nd_cpu 函数

kIm2Col=true，输入是data_im，输出是data_col

[cpp] view plaincopy  
 // im2col_nd_cpu只是将kIm2Col=true然后调用im2col_nd_core_cpu  
 template <typename Dtype>  
 void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     Dtype* data_col) {  
   const bool kIm2Col = true;  
   im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,  
                   kernel_shape, pad, stride, data_col);  
 }  

给出包裹im2col_nd_core_cpu的col2im_nd_cpu函数

一个德行，只不过kIm2Col = false了，此外输入的书data_col而输出的是data_im

[cpp] view plaincopy  
 template <typename Dtype>  
 void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     Dtype* data_im) {  
   const bool kIm2Col = false;  
   im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,  
                      kernel_shape, pad, stride, data_im);  
 }  

还有几个显式的声明，定义float和double类型的函数，下面一并贴出

[cpp] view plaincopy  
 // 显式声明float和double类型的im2col_cpu  
 template void im2col_cpu<float>(const float* data_im, const int channels,  
     const int height, const int width, const int kernel_h, const int kernel_w,  
     const int pad_h, const int pad_w, const int stride_h,  
     const int stride_w, float* data_col);  
 template void im2col_cpu<double>(const double* data_im, const int channels,  
     const int height, const int width, const int kernel_h, const int kernel_w,  
     const int pad_h, const int pad_w, const int stride_h,  
     const int stride_w, double* data_col);  

// 显式声明float和double类型的im2col_nd_cpu

[cpp] view plaincopy  
 template void im2col_nd_cpu<float>(const float* data_im,  
     const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     float* data_col);  
 template void im2col_nd_cpu<double>(const double* data_im,  
     const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     double* data_col);  

// 显式声明float和double类型的col2im_cpu

[cpp] view plaincopy  
 template void col2im_cpu<float>(const float* data_col, const int channels,  
     const int height, const int width, const int patch_h, const int patch_w,  
     const int pad_h, const int pad_w, const int stride_h,  
     const int stride_w, float* data_im);  
 template void col2im_cpu<double>(const double* data_col, const int channels,  
     const int height, const int width, const int patch_h, const int patch_w,  
     const int pad_h, const int pad_w, const int stride_h,  
     const int stride_w, double* data_im);  

// 显式声明float和double类型的col2im_nd_cpu

[cpp] view plaincopy  
 template void col2im_nd_cpu<float>(const float* data_col,  
     const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     float* data_im);  
 template void col2im_nd_cpu<double>(const double* data_col,  
     const int num_spatial_axes,  
     const int* im_shape, const int* col_shape,  
     const int* kernel_shape, const int* pad, const int* stride,  
     double* data_im);  

三、总结

一开始看的时候，看懵了，没看懂im2col，后来查了下matlab的im2col，发现原来做了这么一件事情，就是给定一个大小的窗口之后，im2col会将窗口内的元素flat成一个列向量，而列向量的个数则是这个大小的窗口在图像上的滑动的次数（这个滑动的次数实际上也是经过卷积之后的图像中的像素个数，不知道你理解没:)）。

窥一斑难见全豹，详细的注释请在http://download.csdn.net/detail/xizero00/9480181下载