The ECO Classification Model
The ECO model classifies videos. A video is a sequence of still frames played back in quick succession, which the human eye perceives as continuous motion; the playback rate is measured in FPS (frames per second), i.e., how many images are shown each second. Could we simply tile the frames into one large image and feed it to an ordinary image classifier? Not really: a static composite cannot express the temporal dimension, so the movement and speed of people and objects in the video are lost. In 2014, researchers brought optical flow into video classification. Optical flow describes how fast objects move between two consecutive frames: the faster the motion, the longer the flow vector, so by tracking how the flow changes one can recover when motion starts and stops as well as how fast it is. The ECO model is one approach to capturing this temporal information. Rather than feeding the raw frames directly into a C3D network, it first uses 2D convolutions to produce compact per-frame feature maps, and then feeds these features into a 3D (C3D-style) network for temporal processing.
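As a rough illustration of how a clip becomes the small stack of still frames that such a model consumes, the sketch below samples evenly spaced frames with OpenCV. The function name, frame count, and resize size are illustrative assumptions and are not part of the ECO code itself.

import cv2
import numpy as np

def sample_frames(video_path, num_segments=16, size=224):
    """Return `num_segments` evenly spaced frames from a video,
    resized to `size` x `size`, as an array of shape (T, H, W, 3)."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Pick evenly spaced frame indices across the whole clip.
    indices = np.linspace(0, max(total - 1, 0), num_segments).astype(int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if not ok:
            break
        frame = cv2.resize(frame, (size, size))
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return np.stack(frames)  # (num_segments, size, size, 3)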
Network Structure
The ECO network is made up of four main parts, of which the 2D Net and the 3D Net are the core.
2D Net
The 2D Net consists of four submodules: BasicConv, InceptionA, InceptionB, and InceptionC. BasicConv performs the initial convolutional feature transform, taking an input of 3 × 224 × 224 and producing an output of 192 × 28 × 28. InceptionA through InceptionC transform the features further, with output sizes of 256 × 28 × 28, 320 × 28 × 28, and 96 × 28 × 28 respectively. The code is as follows:
BasicConv
import torch
import torch.nn as nn


class BasicConv(nn.Module):
    '''The opening module of ECO's 2D Net.'''

    def __init__(self):
        super(BasicConv, self).__init__()

        # 3 x 224 x 224 -> 64 x 56 x 56
        self.conv1_7x7_s2 = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
        self.conv1_7x7_s2_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv1_relu_7x7 = nn.ReLU(inplace=True)
        self.pool1_3x3_s2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)

        # 64 x 56 x 56 -> 192 x 28 x 28
        self.conv2_3x3_reduce = nn.Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        self.conv2_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2_relu_3x3_reduce = nn.ReLU(inplace=True)
        self.conv2_3x3 = nn.Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2_relu_3x3 = nn.ReLU(inplace=True)
        self.pool2_3x3_s2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)

    def forward(self, x):
        out = self.conv1_7x7_s2(x)
        out = self.conv1_7x7_s2_bn(out)
        out = self.conv1_relu_7x7(out)
        out = self.pool1_3x3_s2(out)
        out = self.conv2_3x3_reduce(out)
        out = self.conv2_3x3_reduce_bn(out)
        out = self.conv2_relu_3x3_reduce(out)
        out = self.conv2_3x3(out)
        out = self.conv2_3x3_bn(out)
        out = self.conv2_relu_3x3(out)
        out = self.pool2_3x3_s2(out)
        return out
InceptionA
class InceptionA(nn.Module):
    '''InceptionA'''

    def __init__(self):
        super(InceptionA, self).__init__()

        self.inception_3a_1x1 = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3a_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_1x1 = nn.ReLU(inplace=True)

        self.inception_3a_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3a_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_3x3_reduce = nn.ReLU(inplace=True)
        self.inception_3a_3x3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3a_3x3_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_3x3 = nn.ReLU(inplace=True)

        self.inception_3a_double_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3a_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_double_3x3_reduce = nn.ReLU(inplace=True)
        self.inception_3a_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3a_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_double_3x3_1 = nn.ReLU(inplace=True)
        self.inception_3a_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3a_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_double_3x3_2 = nn.ReLU(inplace=True)

        self.inception_3a_pool = nn.AvgPool2d(kernel_size=3, stride=1, padding=1)
        self.inception_3a_pool_proj = nn.Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3a_pool_proj_bn = nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3a_relu_pool_proj = nn.ReLU(inplace=True)

    def forward(self, x):
        out1 = self.inception_3a_1x1(x)
        out1 = self.inception_3a_1x1_bn(out1)
        out1 = self.inception_3a_relu_1x1(out1)

        out2 = self.inception_3a_3x3_reduce(x)
        out2 = self.inception_3a_3x3_reduce_bn(out2)
        out2 = self.inception_3a_relu_3x3_reduce(out2)
        out2 = self.inception_3a_3x3(out2)
        out2 = self.inception_3a_3x3_bn(out2)
        out2 = self.inception_3a_relu_3x3(out2)

        out3 = self.inception_3a_double_3x3_reduce(x)
        out3 = self.inception_3a_double_3x3_reduce_bn(out3)
        out3 = self.inception_3a_relu_double_3x3_reduce(out3)
        out3 = self.inception_3a_double_3x3_1(out3)
        out3 = self.inception_3a_double_3x3_1_bn(out3)
        out3 = self.inception_3a_relu_double_3x3_1(out3)
        out3 = self.inception_3a_double_3x3_2(out3)
        out3 = self.inception_3a_double_3x3_2_bn(out3)
        out3 = self.inception_3a_relu_double_3x3_2(out3)

        out4 = self.inception_3a_pool(x)
        out4 = self.inception_3a_pool_proj(out4)
        out4 = self.inception_3a_pool_proj_bn(out4)
        out4 = self.inception_3a_relu_pool_proj(out4)

        outputs = [out1, out2, out3, out4]
        return torch.cat(outputs, 1)
InceptionB
class InceptionB(nn.Module):
    '''InceptionB'''

    def __init__(self):
        super(InceptionB, self).__init__()

        self.inception_3b_1x1 = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3b_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_1x1 = nn.ReLU(inplace=True)

        self.inception_3b_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3b_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_3x3_reduce = nn.ReLU(inplace=True)
        self.inception_3b_3x3 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3b_3x3_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_3x3 = nn.ReLU(inplace=True)

        self.inception_3b_double_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3b_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_double_3x3_reduce = nn.ReLU(inplace=True)
        self.inception_3b_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3b_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_double_3x3_1 = nn.ReLU(inplace=True)
        self.inception_3b_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3b_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_double_3x3_2 = nn.ReLU(inplace=True)

        self.inception_3b_pool = nn.AvgPool2d(kernel_size=3, stride=1, padding=1)
        self.inception_3b_pool_proj = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3b_pool_proj_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3b_relu_pool_proj = nn.ReLU(inplace=True)

    def forward(self, x):
        out1 = self.inception_3b_1x1(x)
        out1 = self.inception_3b_1x1_bn(out1)
        out1 = self.inception_3b_relu_1x1(out1)

        out2 = self.inception_3b_3x3_reduce(x)
        out2 = self.inception_3b_3x3_reduce_bn(out2)
        out2 = self.inception_3b_relu_3x3_reduce(out2)
        out2 = self.inception_3b_3x3(out2)
        out2 = self.inception_3b_3x3_bn(out2)
        out2 = self.inception_3b_relu_3x3(out2)

        out3 = self.inception_3b_double_3x3_reduce(x)
        out3 = self.inception_3b_double_3x3_reduce_bn(out3)
        out3 = self.inception_3b_relu_double_3x3_reduce(out3)
        out3 = self.inception_3b_double_3x3_1(out3)
        out3 = self.inception_3b_double_3x3_1_bn(out3)
        out3 = self.inception_3b_relu_double_3x3_1(out3)
        out3 = self.inception_3b_double_3x3_2(out3)
        out3 = self.inception_3b_double_3x3_2_bn(out3)
        out3 = self.inception_3b_relu_double_3x3_2(out3)

        out4 = self.inception_3b_pool(x)
        out4 = self.inception_3b_pool_proj(out4)
        out4 = self.inception_3b_pool_proj_bn(out4)
        out4 = self.inception_3b_relu_pool_proj(out4)

        outputs = [out1, out2, out3, out4]
        return torch.cat(outputs, 1)
InceptionC
class InceptionC(nn.Module):
    '''InceptionC'''

    def __init__(self):
        super(InceptionC, self).__init__()

        self.inception_3c_double_3x3_reduce = nn.Conv2d(320, 64, kernel_size=(1, 1), stride=(1, 1))
        self.inception_3c_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3c_relu_double_3x3_reduce = nn.ReLU(inplace=True)
        self.inception_3c_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.inception_3c_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.inception_3c_relu_double_3x3_1 = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.inception_3c_double_3x3_reduce(x)
        out = self.inception_3c_double_3x3_reduce_bn(out)
        out = self.inception_3c_relu_double_3x3_reduce(out)
        out = self.inception_3c_double_3x3_1(out)
        out = self.inception_3c_double_3x3_1_bn(out)
        out = self.inception_3c_relu_double_3x3_1(out)
        return out
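To confirm the shapes quoted earlier, the four 2D modules can be chained on a dummy batch of frames. This is only a sanity-check sketch; the batch of 16 frames stands in for the 16 segments sampled from one clip.

# Sanity check of the 2D Net shapes (sketch, not part of the original code).
x = torch.randn(16, 3, 224, 224)   # 16 sampled frames treated as a batch
x = BasicConv()(x)                 # -> (16, 192, 28, 28)
x = InceptionA()(x)                # -> (16, 256, 28, 28)
x = InceptionB()(x)                # -> (16, 320, 28, 28)
x = InceptionC()(x)                # -> (16, 96, 28, 28)
print(x.shape)                     # torch.Size([16, 96, 28, 28])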
3D Net
The 2D Net output (16 × 96 × 28 × 28, i.e., 16 frames of 96 × 28 × 28 feature maps) then enters the 3D Net. First the data is reshaped so that the frame index becomes the temporal dimension, giving the channel × time × height × width layout that 3D convolutions expect. The 3D Net consists of three submodules, Resnet_3D_3, Resnet_3D_4, and Resnet_3D_5, which transform the features to 128 × 16 × 28 × 28, 256 × 8 × 14 × 14, and 512 × 4 × 7 × 7 in turn; global pooling finally reduces this to a 512-dimensional feature vector. The code is as follows:
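The exact reshaping code depends on the implementation, but conceptually the per-frame 2D features are regrouped so that the frame index becomes the temporal depth of an (N, C, T, H, W) tensor, which is the layout nn.Conv3d expects. A minimal sketch, assuming 16 sampled frames per clip:

# Sketch: regroup the 2D Net output into the (N, C, T, H, W) layout used by Conv3d.
num_segments = 16                                      # frames sampled per clip (assumption)
feat_2d = torch.randn(num_segments, 96, 28, 28)        # (batch * T, C, H, W) from the 2D Net
feat_3d = feat_2d.view(-1, num_segments, 96, 28, 28)   # (batch, T, C, H, W)
feat_3d = feat_3d.permute(0, 2, 1, 3, 4).contiguous()  # (batch, C, T, H, W) = (1, 96, 16, 28, 28)
# feat_3d can now be fed to the Resnet_3D_3 module defined below.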
Resnet_3D_3
class Resnet_3D_3(nn.Module):
    '''Resnet_3D_3'''

    def __init__(self):
        super(Resnet_3D_3, self).__init__()

        self.res3a_2 = nn.Conv3d(96, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res3a_bn = nn.BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res3a_relu = nn.ReLU(inplace=True)

        self.res3b_1 = nn.Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res3b_1_bn = nn.BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res3b_1_relu = nn.ReLU(inplace=True)
        self.res3b_2 = nn.Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res3b_bn = nn.BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res3b_relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = self.res3a_2(x)
        out = self.res3a_bn(residual)
        out = self.res3a_relu(out)

        out = self.res3b_1(out)
        out = self.res3b_1_bn(out)
        out = self.res3b_1_relu(out)
        out = self.res3b_2(out)

        out += residual
        out = self.res3b_bn(out)
        out = self.res3b_relu(out)
        return out
Resnet_3D_4
class Resnet_3D_4(nn.Module):
    '''Resnet_3D_4'''

    def __init__(self):
        super(Resnet_3D_4, self).__init__()

        self.res4a_1 = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
        self.res4a_1_bn = nn.BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res4a_1_relu = nn.ReLU(inplace=True)
        self.res4a_2 = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))

        self.res4a_down = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
        self.res4a_bn = nn.BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res4a_relu = nn.ReLU(inplace=True)

        self.res4b_1 = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res4b_1_bn = nn.BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res4b_1_relu = nn.ReLU(inplace=True)
        self.res4b_2 = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res4b_bn = nn.BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res4b_relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = self.res4a_down(x)
        out = self.res4a_1(x)
        out = self.res4a_1_bn(out)
        out = self.res4a_1_relu(out)
        out = self.res4a_2(out)
        out += residual

        residual2 = out
        out = self.res4a_bn(out)
        out = self.res4a_relu(out)
        out = self.res4b_1(out)
        out = self.res4b_1_bn(out)
        out = self.res4b_1_relu(out)
        out = self.res4b_2(out)
        out += residual2

        out = self.res4b_bn(out)
        out = self.res4b_relu(out)
        return out
Resnet_3D_5
class Resnet_3D_5(nn.Module):
    '''Resnet_3D_5'''

    def __init__(self):
        super(Resnet_3D_5, self).__init__()

        self.res5a_1 = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
        self.res5a_1_bn = nn.BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res5a_1_relu = nn.ReLU(inplace=True)
        self.res5a_2 = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))

        self.res5a_down = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
        self.res5a_bn = nn.BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res5a_relu = nn.ReLU(inplace=True)

        self.res5b_1 = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res5b_1_bn = nn.BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res5b_1_relu = nn.ReLU(inplace=True)
        self.res5b_2 = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.res5b_bn = nn.BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.res5b_relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = self.res5a_down(x)
        out = self.res5a_1(x)
        out = self.res5a_1_bn(out)
        out = self.res5a_1_relu(out)
        out = self.res5a_2(out)
        out += residual  # res5a

        residual2 = out
        out = self.res5a_bn(out)
        out = self.res5a_relu(out)
        out = self.res5b_1(out)
        out = self.res5b_1_bn(out)
        out = self.res5b_1_relu(out)
        out = self.res5b_2(out)
        out += residual2  # res5b

        out = self.res5b_bn(out)
        out = self.res5b_relu(out)
        return out
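Putting the pieces together, an ECO-Lite-style classifier can chain the 2D and 3D modules, average-pool the final 512-channel volume into a 512-dimensional vector, and apply a linear classification head. The sketch below is assembled from the modules above; the class name, default segment count, and number of classes are assumptions rather than the official implementation, for which see the repository linked below.

class ECOLite(nn.Module):
    '''Sketch of an ECO-Lite-style classifier built from the modules above.'''

    def __init__(self, num_classes=400, num_segments=16):
        super(ECOLite, self).__init__()
        self.num_segments = num_segments
        # 2D Net
        self.basic_conv = BasicConv()
        self.inception_a = InceptionA()
        self.inception_b = InceptionB()
        self.inception_c = InceptionC()
        # 3D Net
        self.res_3d_3 = Resnet_3D_3()
        self.res_3d_4 = Resnet_3D_4()
        self.res_3d_5 = Resnet_3D_5()
        # Global pooling and classification head
        self.global_pool = nn.AdaptiveAvgPool3d(1)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        # x: (batch * num_segments, 3, 224, 224)
        out = self.basic_conv(x)
        out = self.inception_a(out)
        out = self.inception_b(out)
        out = self.inception_c(out)                    # (batch*T, 96, 28, 28)
        # Regroup the frames into the temporal dimension for the 3D Net.
        out = out.view(-1, self.num_segments, 96, 28, 28)
        out = out.permute(0, 2, 1, 3, 4).contiguous()  # (batch, 96, T, 28, 28)
        out = self.res_3d_3(out)                       # (batch, 128, 16, 28, 28)
        out = self.res_3d_4(out)                       # (batch, 256, 8, 14, 14)
        out = self.res_3d_5(out)                       # (batch, 512, 4, 7, 7)
        out = self.global_pool(out).flatten(1)         # (batch, 512)
        return self.fc(out)

For example, logits = ECOLite(num_classes=400)(torch.randn(16, 3, 224, 224)) produces one set of class scores for a single 16-frame clip.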
If you are interested, you can download the model from GitHub and run inference yourself: https://github.com/mzolfaghari/ECO-efficient-video-understanding