http://blog.csdn.net/xizero00/article/details/51049858
一、 卷积层的作用简介
卷积层是深度神经网络中的一个重要的层,该层实现了局部感受野,通过这种局部感受野,可以有效地降低参数的数目。
我们将结合caffe来讲解具体是如何实现卷积层的前传和反传的。至于是如何前传和反传的原理可以参考Notes on Convolutional Neural Networks,具体请百度或者谷歌,即可下载到。
Caffe中的master分支已经将vision_layers.hpp中的各个层分散到layers中去了,因此如果你是主分支的代码,请在include/layers中找BaseConvolutionLayer和ConvolutionLayer的头文件的定义。
二、卷积层的详细介绍
1)构造函数
-
- explicit BaseConvolutionLayer(const LayerParameter& param)
- : Layer<Dtype>(param) {}
2)成员变量
-
-
- Blob<int> kernel_shape_;
-
-
-
- Blob<int> stride_;
-
-
-
- Blob<int> pad_;
-
-
-
- Blob<int> conv_input_shape_;
-
-
-
- vector<int> col_buffer_shape_;
-
-
-
- vector<int> output_shape_;
-
-
- const vector<int>* bottom_shape_;
-
-
- int num_spatial_axes_;
-
-
- int bottom_dim_;
-
-
- int top_dim_;
-
-
- int channel_axis_;
-
-
- int num_;
-
-
- int channels_;
-
-
- int group_;
-
-
- int out_spatial_dim_;
-
-
- int weight_offset_;
-
-
- int num_output_;
-
-
- bool bias_term_;
-
-
- bool is_1x1_;
-
-
- bool force_nd_im2col_;
-
-
- int num_kernels_im2col_;
-
- int num_kernels_col2im_;
-
-
- int conv_out_channels_;
-
-
- int conv_in_channels_;
-
-
- int conv_out_spatial_dim_;
-
-
- int kernel_dim_;
-
-
- int col_offset_;
- int output_offset_;
-
-
- Blob<Dtype> col_buffer_;
-
-
- Blob<Dtype> bias_multiplier_;
为了更为准确地解释,我把代码中的各个变量的含义也贴进来:
- channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_aram.axis());
- num_spatial_axes_ = num_axes - first_spatial_axis;
-
- force_nd_im2col_ = conv_param.force_nd_im2col();
-
-
- kernel_shape_data[0]和[1]=conv_param.kernel_size(0)
- stride_data[0]和[1] = conv_param.stride(0)
- pad_data[0]和[1] = conv_param.pad[0]
-
-
- conv_in_channels_ = channels_;
-
-
- conv_out_channels_ = num_output_;
- bias_term_ = 1或者0
-
-
- kernel_dim_ = input channels per-group x kernel height x kernel width
-
-
- weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
-
-
- num_ = batchsize
-
-
- bottom_shape_ = &bottom[0]->shape();
-
-
- conv_out_spatial_dim_ = 输出图像的长度和宽度
-
-
- kernel_dim_ = input channels per-group x kernel height x kernel width
-
-
-
- col_buf_shape = kernel_dim_ x
输入的大小 bottom_dim_ = conv_in_channel X in_height X in_width
输出的大小 top_dim_ = conv_out_channel X out_height X out_width
卷积核输入的图像大小num_kernels_im2col_ = conv_in_channel X out_height X out_width
卷积核输出的图像大小num_kernels_col2im_ = conv_out_channel X out_height X out_width
将偏置扩展成矩阵的bias_multiplier_
weight_shape = [conv_out_channels_, conv_in_channels_/group, weight_h, weight_w]
conv_input_shape_data = [inchannel, in_height, in width]
kernel_shape_=[kernel_height, kernel_width]
3)成员函数
在给出上述的变量以后,不得不介绍一下caffe里面究竟是如何实现卷积的,在介绍成员函数这一小节,我们从调用的顺序讲起是如何实现卷积以及反传的。
ConvolutionLayer是继承于BaseConvolutionLayer的,而BaseConvolutionLayer才是真正实现卷积及其反传的,而在BaseConvolutionLayer中的卷积的实现中有一个重要的函数就是im2col以及col2im,im2colnd以及col2imnd。前面的两个函数是二维卷积的正向和逆向过程,而后面的两个函数是n维卷积的正向和逆向过程。
首先放出conv_layer.cpp的前传代码
- template <typename Dtype>
- void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top) {
- const Dtype* weight = this->blobs_[0]->cpu_data();
- for (int i = 0; i < bottom.size(); ++i) {
-
- const Dtype* bottom_data = bottom[i]->cpu_data();
- Dtype* top_data = top[i]->mutable_cpu_data();
-
- for (int n = 0; n < this->num_; ++n) {
-
-
-
-
- this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
- top_data + n * this->top_dim_);
- if (this->bias_term_) {
- const Dtype* bias = this->blobs_[1]->cpu_data();
- this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
- }
- }
- }
- }
bottom_data与top_data中存放着输入和输出数据,blobs_[0]存放着权重数据,blobs_[1]存放着偏置信息。
上述的代码中调用了基类的前传函数,那么我们就把基类的前传函数拿出来看看
- template <typename Dtype>
- void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
- const Dtype* weights, Dtype* output, bool skip_im2col) {
- const Dtype* col_buff = input;
- if (!is_1x1_) {
- if (!skip_im2col) {
-
-
-
-
- conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
- }
- col_buff = col_buffer_.cpu_data();
- }
-
-
- for (int g = 0; g < group_; ++g) {
-
-
-
-
-
-
-
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
- group_, conv_out_spatial_dim_, kernel_dim_,
- (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
- (Dtype)0., output + output_offset_ * g);
- }
- }
-
- template <typename Dtype>
- void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
- const Dtype* bias) {
-
-
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
- out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
- (Dtype)1., output);
- }
而积累中又使用了conv_im2col_cpu将卷积核在图像上的滑动转换为了矩阵。
这就是真身:
- inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
- if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
- im2col_cpu(data, conv_in_channels_,
- conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
- kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
- pad_.cpu_data()[0], pad_.cpu_data()[1],
- stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
- } else {
- im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
- col_buffer_shape_.data(), kernel_shape_.cpu_data(),
- pad_.cpu_data(), stride_.cpu_data(), col_buff);
- }
- }
调用im2col_cpu的时候输入的参数为
im2col_cpu(一幅图像,输入图像的channel, 输入图像的height, 输入图像的width, kernel的height, kernel的width, pad的height, pad的width, stride的height, stride的width)
其函数原型如下
-
-
-
-
-
-
-
- template <typename Dtype>
- void im2col_cpu(const Dtype* data_im, const int channels,
- const int height, const int width, const int kernel_h, const int kernel_w,
- const int pad_h, const int pad_w,
- const int stride_h, const int stride_w,
- Dtype* data_col) {
- int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
- int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
- int channels_col = channels * kernel_h * kernel_w;
-
-
- for (int c = 0; c < channels_col; ++c) {
-
-
-
-
- int w_offset = c % kernel_w;
-
- int h_offset = (c / kernel_w) % kernel_h;
-
- int c_im = c / kernel_h / kernel_w;
-
-
- for (int h = 0; h < height_col; ++h) {
- for (int w = 0; w < width_col; ++w) {
-
-
-
-
-
-
- int h_pad = h * stride_h - pad_h + h_offset;
- int w_pad = w * stride_w - pad_w + w_offset;
- if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
- data_col[(c * height_col + h) * width_col + w] =
- data_im[(c_im * height + h_pad) * width + w_pad];
- else
- data_col[(c * height_col + h) * width_col + w] = 0;
- }
- }
- }
- }
看起来这段代码很难理解,其实你不妨把计算卷积之后的图像与卷积之前的图像的位置看成是一个函数find_src_location
该函数输入的是h_offset,w_offset
该函数主要就是将每一个子矩形(与kernel一样大小的图像)的(h_offset,w_offset )的位置的像素取出来放到第c行去
- void find_src_location(h_offset, w_offset)
- {
-
-
-
- for (int h = 0; h < height_col; ++h) {
- for (int w = 0; w < width_col; ++w) {
-
-
- int h_pad = h * stride_h - pad_h + h_offset;
- int w_pad = w * stride_w - pad_w + w_offset;
- if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
- data_col[(c * height_col + h) * width_col + w] =
- data_im[(c_im * height + h_pad) * width + w_pad];
- else
- data_col[(c * height_col + h) * width_col + w] = 0;
- }
- }
- }
这个函数具体是做了啥事了呢,我来画个图
如下图所示,左边的是输入的原始图像,该图像需要进行卷积,右图是经过im2col处理的得到的图像
结合图来说这个函数find_src_location内部的两个for循环干的事情就是将每个不同颜色的小矩形框内部(h_offset,w_offset )位置的像素值复制到
第c行的位置去。比如下图中的右图,第一行的1,就是左图中的每一个不同颜色的小矩形框内的(0,0)位置的1全部复制过来的。
图1 解释im2col的工作过程
上图中输入图像的channel=1,长im_w是9,宽im_h是9,
kernel的h和w都是3,因为kernel是在图像上面滑动,所以kernel_channel=1
假设padding的h和w都是0,也就是不填充
stride_h为3,stide_w=3
那么通过该函数,就可以获得新的矩形
长是:channel*kernel_h*kernel_w = 1*3*3=9
宽是:[(im_h+2*pad_h-kernel_h)/stride_h+1] * [(im_w+2*pad_w-kernel_w)/stride_w+1]=[(9+2*0-3)/3+1]*[(9+2*0-3)/3+1]=9
注意:这个宽呢,就是经过卷积之后的图像的长和宽相乘
含义是什么:
将一个kernel大小的图像块变成图中新的矩形的一个列,一列的长是channel*kernel_h*kernel_w = 1*3*3=9
而宽(列数)则是在图像上的滑动的次数,这里是9,之所以是9并不是巧合,而是取了stride_h和stride_w都为3了。
可是这么干是为了什么呢?
这可以将滑动的卷积变成两个矩形相乘。
怎么讲,假设一个kernel输出的channel=output channel
那么对应的卷积权重W矩阵的形状即为outputchannle x [channel*kernel_h*kernel_w]
而图像经过im2col处理之后的原始图像srcimg变成了[channel*kernel_h*kernel_w] x [卷积之后的图像的长和宽相乘 ]
W x srcimg = outputchannle x 卷积之后的图像的长和宽相乘
怎么样,这下理解了吧
很机智地将卷积变成了两个矩阵相乘了。。。
而对应的col2im的代码就很类似了与im2col 的代码几乎没有啥差别就是这个下面的赋值语句的位置颠倒了一下
- template <typename Dtype>
- void col2im_cpu(const Dtype* data_col, const int channels,
- const int height, const int width, const int patch_h, const int patch_w,
- const int pad_h, const int pad_w,
- const int stride_h, const int stride_w,
- Dtype* data_im) {
- caffe_set(height * width * channels, Dtype(0), data_im);
- int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
- int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
- int channels_col = channels * patch_h * patch_w;
- for (int c = 0; c < channels_col; ++c) {
- int w_offset = c % patch_w;
- int h_offset = (c / patch_w) % patch_h;
- int c_im = c / patch_h / patch_w;
- for (int h = 0; h < height_col; ++h) {
- for (int w = 0; w < width_col; ++w) {
- int h_pad = h * stride_h - pad_h + h_offset;
- int w_pad = w * stride_w - pad_w + w_offset;
- if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
- data_im[(c_im * height + h_pad) * width + w_pad] +=
- data_col[(c * height_col + h) * width_col + w];
- }
- }
- }
- }
上面介绍了二维卷积,那么我们就乘热打铁,再看看n维通用卷积是如何实现的
接下来介绍n维通用的卷积的具体实现
n维卷积的实现与二维卷积的实现很类似,只不过对应的变量进行了变化,你只需要找到对应就可以很快理解
d_offset 对应于im2col中的h_offset和w_offset是一个输入图像的channel 乘以kernel_size大小的图像块的偏移量(kernel_size下面的代码有定义)
d_iter对应于im2col中内层for循环的h和w,是经过im2colnd处理过的col_buff中的偏移
d_pad对应于im2col中内层for循环的h_pad和w_pad,是输入的原始图像中的偏移
作者还将im2colnd和col2imnd合并到一起实现了,通过const bool im2col来判断是im2col还是col2im
-
-
- template <typename Dtype>
- inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
- const int num_spatial_axes, const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_output) {
-
-
- if (!im2col) {
- int im_size = im_shape[0];
- for (int i = 0; i < num_spatial_axes; ++i) {
- im_size *= im_shape[1 + i];
- }
- caffe_set(im_size, Dtype(0), data_output);
- }
-
-
- int kernel_size = 1;
- for (int i = 0; i < num_spatial_axes; ++i) {
- kernel_size *= kernel_shape[i];
- }
-
-
- const int channels_col = col_shape[0];
-
-
- vector<int> d_offset(num_spatial_axes, 0);
-
-
- vector<int> d_iter(num_spatial_axes, 0);
-
-
- for (int c = 0; c < channels_col; ++c) {
-
-
-
-
-
- int offset = c;
- for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
- if (d_i < num_spatial_axes - 1) {
- offset /= kernel_shape[d_i + 1];
- }
- d_offset[d_i] = offset % kernel_shape[d_i];
- }
-
-
- for (bool incremented = true; incremented; ) {
-
-
-
- int index_col = c;
-
-
-
- int index_im = c / kernel_size;
-
-
- bool is_padding = false;
- for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
-
-
- const int d = d_iter[d_i];
-
-
- const int d_pad = d * stride[d_i] - pad[d_i] + d_offset[d_i];
-
-
- is_padding |= d_pad < 0 || d_pad >= im_shape[d_i + 1];
-
-
- index_col *= col_shape[d_i + 1];
- index_col += d;
-
-
- index_im *= im_shape[d_i + 1];
- index_im += d_pad;
- }
- if (im2col) {
- if (is_padding) {
- data_output[index_col] = 0;
- } else {
- data_output[index_col] = data_input[index_im];
- }
- } else if (!is_padding) {
- data_output[index_im] += data_input[index_col];
- }
-
-
-
-
- incremented = false;
- for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
- const int d_max = col_shape[d_i + 1];
- DCHECK_LT(d_iter[d_i], d_max);
- if (d_iter[d_i] == d_max - 1) {
- d_iter[d_i] = 0;
- } else {
- ++d_iter[d_i];
- incremented = true;
- break;
- }
- }
- }
- }
- }
给出包裹im2col_nd_core_cpu 的im2col_nd_cpu 函数
kIm2Col=true,输入是data_im,输出是data_col
-
- template <typename Dtype>
- void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_col) {
- const bool kIm2Col = true;
- im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,
- kernel_shape, pad, stride, data_col);
- }
给出包裹im2col_nd_core_cpu的col2im_nd_cpu函数
一个德行,只不过kIm2Col = false了,此外输入的书data_col而输出的是data_im
- template <typename Dtype>
- void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_im) {
- const bool kIm2Col = false;
- im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,
- kernel_shape, pad, stride, data_im);
- }
还有几个显式的声明,定义float和double类型的函数,下面一并贴出
-
- template void im2col_cpu<float>(const float* data_im, const int channels,
- const int height, const int width, const int kernel_h, const int kernel_w,
- const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, float* data_col);
- template void im2col_cpu<double>(const double* data_im, const int channels,
- const int height, const int width, const int kernel_h, const int kernel_w,
- const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, double* data_col);
// 显式声明float和double类型的im2col_nd_cpu
- template void im2col_nd_cpu<float>(const float* data_im,
- const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- float* data_col);
- template void im2col_nd_cpu<double>(const double* data_im,
- const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- double* data_col);
// 显式声明float和double类型的col2im_cpu
- template void col2im_cpu<float>(const float* data_col, const int channels,
- const int height, const int width, const int patch_h, const int patch_w,
- const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, float* data_im);
- template void col2im_cpu<double>(const double* data_col, const int channels,
- const int height, const int width, const int patch_h, const int patch_w,
- const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, double* data_im);
// 显式声明float和double类型的col2im_nd_cpu
- template void col2im_nd_cpu<float>(const float* data_col,
- const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- float* data_im);
- template void col2im_nd_cpu<double>(const double* data_col,
- const int num_spatial_axes,
- const int* im_shape, const int* col_shape,
- const int* kernel_shape, const int* pad, const int* stride,
- double* data_im);
三、总结
一开始看的时候,看懵了,没看懂im2col,后来查了下matlab的im2col,发现原来做了这么一件事情,就是给定一个大小的窗口之后,im2col会将窗口内的元素flat成一个列向量,而列向量的个数则是这个大小的窗口在图像上的滑动的次数(这个滑动的次数实际上也是经过卷积之后的图像中的像素个数,不知道你理解没:))。
窥一斑难见全豹,详细的注释请在http://download.csdn.net/detail/xizero00/9480181下载
参考:
[1] 别人写的一个博客介绍im2col这个函数具体是怎么实现的,但是我觉得没写明白
http://blog.csdn.net/ayst123/article/details/43924151
[2] matlab中im2col的解释,这给了我启发
http://blog.163.com/liwei_ie/blog/static/20931113220143954730342/