PaddleVideo：Squeeze Time算法移植

参考
官方文档《添加新算法》：[PaddleVideo/docs/zh-CN/contribute/add_new_algorithm.md（develop 分支）](https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/contribute/add_new_algorithm.md)

PaddleVideo 简介：Awesome video understanding toolkits based on PaddlePaddle. It supports video data annotation tools, lightweight RGB and skeleton based action recognition model, practical applications for video tagging and sport action detection.

1：添加backbone：（网络我自己砍了几刀，目的是想和ppTSM-v2做对比）

paddlevideo/modeling/backbones/squeezetime.py

from __future__ import absolute_import, division, print_function

import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal, Constant
import paddle.nn.functional as F

from ..registry import BACKBONES


def get_inplanes():
    """Return the base channel width of each of the four stages."""
    return [64, 128, 256, 512]


class SpatialConv(nn.Layer):
    """Inter-temporal Object Interaction Module (IOI).

    Multiplies a plain 3x3 convolution branch by a sigmoid gate that is
    computed from the re-weighted input plus a learnable position embedding.

    Args:
        dim_in (int): Number of input channels.
        dim_out (int): Number of output channels.
        pos_dim (int): Side length of the square position embedding. It is
            bilinearly resized at run time when the feature map differs.
    """

    # Index inside ``glo_conv`` of the layer that receives the position
    # embedding on its input (the 7x7 convolution).
    _POS_EMBED_INDEX = 3

    def __init__(self, dim_in, dim_out, pos_dim=7):
        super(SpatialConv, self).__init__()
        self.short_conv = nn.Conv2D(
            dim_in, dim_out, kernel_size=3, stride=1, padding=1, groups=1)
        self.glo_conv = nn.Sequential(
            nn.Conv2D(dim_in, 16, kernel_size=3, stride=1, padding=1, groups=1),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, 16, kernel_size=7, stride=1, padding=3),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, dim_out, kernel_size=3, stride=1, padding=1, groups=1),
            nn.Sigmoid())
        self.pos_embed = self.create_parameter(
            shape=[1, 16, pos_dim, pos_dim],
            default_initializer=nn.initializer.KaimingNormal())

    def forward(self, x, param):
        """Apply the module.

        Args:
            x (Tensor): Input feature map, indexed as [N, C, H, W].
            param (Tensor): Weights multiplied element-wise onto ``x``
                before the gate branch.

        Returns:
            Tensor: ``short_conv(x)`` multiplied by the sigmoid gate.
        """
        x_short = self.short_conv(x)
        x = x * param
        for i in range(len(self.glo_conv)):
            if i == self._POS_EMBED_INDEX:
                _, _, H, W = x.shape
                if self.pos_embed.shape[2] != H or self.pos_embed.shape[3] != W:
                    # Resize the embedding to the current spatial size.
                    pos_embed = F.interpolate(
                        self.pos_embed, size=(H, W), mode='bilinear',
                        align_corners=True)
                else:
                    pos_embed = self.pos_embed
                x = x + pos_embed
            x = self.glo_conv[i](x)
        return x_short * x


class Conv2d(nn.Layer):
    """Channel-Time Learning Module (CTL).

    Sums two branches: a convolution over the channel-reweighted input
    (``temporal_conv``) and a ``SpatialConv`` branch.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of ``temporal_conv``.
        stride (int): Stored on the layer only; both branches always run
            with stride 1.
        padding (int): Padding of ``temporal_conv``.
        dilation (int): Dilation of ``temporal_conv``.
        groups (int): Groups of ``temporal_conv``.
        bias (bool): Passed to ``temporal_conv`` as ``bias_attr``.
        padding_mode (str): Padding mode of ``temporal_conv``.
        pos_dim (int): Position-embedding size of the spatial branch.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1,
                 bias: bool = True,
                 padding_mode: str = 'zeros',
                 pos_dim=7):
        super(Conv2d, self).__init__()
        self.stride = stride
        # Global pooling followed by two 1x1 convolutions and a sigmoid:
        # produces one weight per input channel.
        self.param_conv = nn.Sequential(
            nn.AdaptiveAvgPool2D((1, 1)),
            nn.Conv2D(in_channels, in_channels, 1, stride=1, padding=0,
                      bias_attr=False),
            nn.BatchNorm2D(in_channels),
            nn.ReLU(),
            nn.Conv2D(in_channels, in_channels, 1, bias_attr=False),
            nn.Sigmoid())
        self.temporal_conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias_attr=bias,
            padding_mode=padding_mode)
        self.spatial_conv = SpatialConv(
            dim_in=in_channels, dim_out=out_channels, pos_dim=pos_dim)

    def forward(self, x):
        """Return ``temporal_conv(param * x) + spatial_conv(x, param)``."""
        param = self.param_conv(x)
        x = self.temporal_conv(param * x) + self.spatial_conv(x, param)
        return x


def conv3x3x3(in_planes, out_planes, stride=1, pos_dim=7):
    """Build a CTL ``Conv2d`` with a 1x1 ``temporal_conv`` and no bias.

    The name is kept for interface compatibility; the kernel size is 1.
    """
    return Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                  padding=0, bias=False, pos_dim=pos_dim)


def conv1x1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2D``."""
    return nn.Conv2D(in_planes, out_planes, kernel_size=1, stride=stride,
                     bias_attr=False)


class BasicBlock(nn.Layer):
    """Channel-Time Learning (CTL) block with two CTL convolutions.

    Args:
        in_planes (int): Number of input channels.
        planes (int): Number of output channels.
        stride (int): When not 1, the input is first halved spatially by a
            depthwise 2x2 stride-2 convolution.
        shortcut_conv (nn.Layer, optional): Projection for the residual path.
        pos_dim (int): Position-embedding size of the second CTL convolution.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None,
                 pos_dim=7):
        super().__init__()
        # NOTE(review): conv1 does not receive pos_dim and therefore uses the
        # default of 7; left unchanged so parameter shapes stay the same.
        self.conv1 = conv3x3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2D(planes)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)
        self.shortcut_conv = shortcut_conv
        self.stride = stride
        if stride != 1:
            self.downsample = nn.Sequential(
                nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2,
                          groups=in_planes),
                nn.BatchNorm2D(in_planes))

    def forward(self, x):
        """Run the block; the residual is taken after the downsampling."""
        if self.stride != 1:
            x = self.downsample(x)
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.shortcut_conv is not None:
            residual = self.shortcut_conv(x)

        out += residual
        out = self.relu(out)
        return out


class Bottleneck(nn.Layer):
    """Channel-Time Learning (CTL) bottleneck block (1x1 -> CTL -> 1x1).

    Args:
        in_planes (int): Number of input channels.
        planes (int): Bottleneck width; the block outputs
            ``planes * expansion`` channels.
        stride (int): When not 1, the input is first halved spatially by a
            depthwise 2x2 stride-2 convolution.
        shortcut_conv (nn.Layer, optional): Projection for the residual path.
        pos_dim (int): Position-embedding size of the CTL convolution.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None,
                 pos_dim=7):
        super().__init__()
        self.conv1 = conv1x1x1(in_planes, planes)
        self.bn1 = nn.BatchNorm2D(planes)
        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)
        self.conv3 = conv1x1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2D(planes * self.expansion)
        self.relu = nn.ReLU()
        self.shortcut_conv = shortcut_conv
        self.stride = stride
        if stride != 1:
            self.downsample = nn.Sequential(
                nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2,
                          groups=in_planes),
                nn.BatchNorm2D(in_planes))

    def forward(self, x):
        """Run the block; the residual is taken after the downsampling."""
        if self.stride != 1:
            x = self.downsample(x)
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.shortcut_conv is not None:
            residual = self.shortcut_conv(x)

        out += residual
        out = self.relu(out)
        return out


class ResNet(nn.Layer):
    """ResNet-style 2D backbone operating on frames stacked along channels.

    Args:
        block (type): Block class, ``BasicBlock`` or ``Bottleneck``.
        layers (list[int]): Number of blocks in each of the four stages.
        block_inplanes (list[int]): Base width of each stage.
        n_input_channels (int): Input channels of the stem convolution.
        no_max_pool (bool): Skip the stem max-pooling when True.
        shortcut_type (str): Accepted for compatibility; not used.
        widen_factor (float): Multiplier applied to ``block_inplanes``.
        dropout (float): Stored on the layer; not applied in ``forward``.
        freeze_bn (bool): Freeze all ``BatchNorm2D`` layers in ``train``.
        spatial_stride (sequence[int]): Stride of each stage.
        pos_dim (sequence[int]): Position-embedding size of each stage.
        num_seg (int): Number of frames per clip that ``forward`` folds from
            the batch axis into the channel axis.
    """

    def __init__(self,
                 block,
                 layers,
                 block_inplanes,
                 n_input_channels=3,
                 no_max_pool=False,
                 shortcut_type='B',
                 widen_factor=1.0,
                 dropout=0.2,
                 freeze_bn=False,
                 spatial_stride=(1, 2, 2, 2),
                 pos_dim=(64, 32, 16, 8),
                 num_seg=16):
        super().__init__()
        self.freeze_bn = freeze_bn
        block_inplanes = [int(x * widen_factor) for x in block_inplanes]
        self.in_planes = block_inplanes[0]
        self.no_max_pool = no_max_pool
        self.dropout = dropout
        self.num_seg = num_seg
        self.n_input_channels = n_input_channels

        self.conv1 = nn.Conv2D(n_input_channels,
                               self.in_planes,
                               kernel_size=5,
                               stride=2,
                               padding=2,
                               groups=1,
                               bias_attr=False)
        self.bn1 = nn.BatchNorm2D(self.in_planes)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, block_inplanes[0], layers[0],
                                       shortcut_type,
                                       stride=spatial_stride[0],
                                       pos_dim=pos_dim[0])
        self.layer2 = self._make_layer(block, block_inplanes[1], layers[1],
                                       shortcut_type,
                                       stride=spatial_stride[1],
                                       pos_dim=pos_dim[1])
        self.layer3 = self._make_layer(block, block_inplanes[2], layers[2],
                                       shortcut_type,
                                       stride=spatial_stride[2],
                                       pos_dim=pos_dim[2])
        self.layer4 = self._make_layer(block, block_inplanes[3], layers[3],
                                       shortcut_type,
                                       stride=spatial_stride[3],
                                       pos_dim=pos_dim[3])

    def _downsample_basic_block(self, x, planes, stride):
        """Subsample ``x`` and zero-pad its channel axis up to ``planes``."""
        out = F.avg_pool2d(x, kernel_size=1, stride=stride)
        # The original guarded a ``.cuda()`` call with
        # ``isinstance(out, paddle.CUDAPlace)``, which is never true for a
        # tensor; the padding is simply created with the dtype of ``out``.
        zero_pads = paddle.zeros(
            [out.shape[0], planes - out.shape[1], out.shape[2], out.shape[3]],
            dtype=out.dtype)
        out = paddle.concat([out, zero_pads], axis=1)
        return out

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1,
                    pos_dim=7):
        """Build one stage of ``blocks`` blocks and update ``in_planes``."""
        shortcut = None
        if self.in_planes != planes * block.expansion:
            # Stride stays 1: spatial reduction happens inside the block.
            shortcut = nn.Sequential(
                conv1x1x1(self.in_planes, planes * block.expansion, stride=1),
                nn.BatchNorm2D(planes * block.expansion))

        layers = []
        layers.append(
            block(in_planes=self.in_planes,
                  planes=planes,
                  stride=stride,
                  shortcut_conv=shortcut,
                  pos_dim=pos_dim))
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, planes, pos_dim=pos_dim))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Extract features.

        Args:
            x (Tensor): Either a 4-D tensor indexed as [N, C, H, W] or a 3-D
                tensor, which gets a leading batch axis. Presumably
                N = batch * num_seg with C = 3 when called from Recognizer2D
                (verify against the caller).

        Returns:
            Tensor: Output of the last stage.
        """
        if len(x.shape) == 3:
            x = paddle.unsqueeze(x, axis=0)
        _, C, H, W = x.shape
        if C != self.n_input_channels:
            # Fold every ``num_seg`` consecutive frames into the channel
            # axis. Input that already has the stem's channel count is
            # passed through untouched.
            x = x.reshape([-1, self.num_seg * C, H, W])

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        if not self.no_max_pool:
            x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def train(self, mode=True):
        """Switch between training and evaluation mode.

        ``paddle.nn.Layer.train()`` and ``eval()`` take no argument, so the
        ``mode`` flag is dispatched here instead of being forwarded.
        NOTE(review): a parent layer's ``train()`` may not invoke this
        override; confirm if ``freeze_bn`` is ever enabled.

        Args:
            mode (bool): True for training mode, False for evaluation mode.
        """
        if mode:
            super(ResNet, self).train()
        else:
            super(ResNet, self).eval()

        if self.freeze_bn:
            print("Freezing Mean/Var of BatchNorm2D.")
            print("Freezing Weight/Bias of BatchNorm2D.")
            for m in self.sublayers():
                if isinstance(m, nn.BatchNorm2D):
                    m.eval()
                    m.weight.stop_gradient = True
                    m.bias.stop_gradient = True


def SqueezeTime_model(**kwargs):
    """Build a Bottleneck ``ResNet`` with two blocks per stage."""
    model = ResNet(Bottleneck, [2, 2, 2, 2], get_inplanes(), **kwargs)
    return model


@BACKBONES.register()
def SqueezeTime(pretrained=None, use_ssld=False, num_seg=16, **kwargs):
    """Build SqueezeTime Model.

    Args:
        pretrained: Accepted for config compatibility; not used.
        use_ssld: Accepted for config compatibility; not used.
        num_seg (int): Frames per clip. The stem expects ``3 * num_seg``
            input channels.
        **kwargs: Accepted for config compatibility; ignored.
    """
    model = SqueezeTime_model(widen_factor=0.5,
                              dropout=0.5,
                              n_input_channels=3 * num_seg,
                              freeze_bn=False,
                              spatial_stride=[1, 2, 2, 2],
                              pos_dim=[64, 32, 16, 8],
                              num_seg=num_seg)
    return model

2：导入backbone：

paddlevideo/modeling/backbones/__init__.py

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .actbert import BertForMultiModalPreTraining
from .adds import ADDS_DepthNet
from .agcn import AGCN
from .asrf import ASRF
from .bmn import BMN
from .cfbi import CFBI
from .movinet import MoViNet
from .ms_tcn import MSTCN
from .resnet import ResNet
from .resnet_slowfast import ResNetSlowFast
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .resnet_tsm import ResNetTSM
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tweaks_tsm import ResNetTweaksTSM
from .resnet_tweaks_tsn import ResNetTweaksTSN
from .stgcn import STGCN
from .swin_transformer import SwinTransformer3D
from .transnetv2 import TransNetV2
from .vit import VisionTransformer
from .vit_tweaks import VisionTransformer_tweaks
from .ms_tcn import MSTCN
from .asrf import ASRF
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .cfbi import CFBI
from .ctrgcn import CTRGCN
from .agcn2s import AGCN2s
from .movinet import MoViNet
from .resnet3d_slowonly import ResNet3dSlowOnly
from .toshift_vit import TokenShiftVisionTransformer
from .pptsm_mv2 import PPTSM_MobileNetV2
from .pptsm_mv3 import PPTSM_MobileNetV3
from .pptsm_v2 import PPTSM_v2
from .yowo import YOWO
from .squeezetime import SqueezeTime

# Public backbones exported by this package.
__all__ = [
    'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
    'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
    'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
    'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
    'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',
    'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',
    'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO', 'SqueezeTime'
]

3：添加head：

paddlevideo/modeling/heads/i2d_head.py

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
from paddle import ParamAttr

from ..registry import HEADS
from ..weight_init import weight_init_
from .base import BaseHead


@HEADS.register()
class I2DHead(BaseHead):
    """Classification head for I2D.

    Applies global spatial pooling, optional dropout and one linear layer.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        loss_cfg (dict): Config for building loss.
            Default: dict(name='CrossEntropyLoss')
        spatial_type (str): Pooling type in spatial dimension. 'avg' selects
            average pooling; any other value selects max pooling.
            Default: 'avg'.
        drop_ratio (float): Probability of dropout layer. Default: 0.5.
        std (float): Std value for initialization. Stored as ``init_std``;
            this class does not use it itself. Default: 0.01.
        kwargs (dict, optional): Any keyword argument to be used to
            initialize the head.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 loss_cfg=dict(name='CrossEntropyLoss'),
                 spatial_type='avg',
                 drop_ratio=0.5,
                 std=0.01,
                 **kwargs):
        super().__init__(num_classes, in_channels, loss_cfg, **kwargs)

        self.spatial_type = spatial_type
        self.dropout_ratio = drop_ratio
        self.init_std = std
        if self.dropout_ratio != 0:
            self.dropout = nn.Dropout(p=self.dropout_ratio)
        else:
            self.dropout = None
        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

        if self.spatial_type == 'avg':
            self.avg_pool = nn.AdaptiveAvgPool2D((1, 1))
        else:
            # The attribute keeps the name ``avg_pool`` although it holds a
            # max-pooling layer in this branch.
            self.avg_pool = nn.AdaptiveMaxPool2D((1, 1))

    def forward(self, x, num_segs=None):
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input data; the 2D pooling implies a 4-D tensor,
                presumably [N, in_channels, H, W].
            num_segs (int, optional): Accepted for interface compatibility;
                not used.

        Returns:
            Tensor: The classification scores for input samples.
        """
        # [N, in_channels, H, W]
        if self.avg_pool is not None:
            x = self.avg_pool(x)
        # [N, in_channels, 1, 1]
        if self.dropout is not None:
            x = self.dropout(x)
        # [N, in_channels, 1, 1]
        x = paddle.reshape(x, [x.shape[0], -1])
        # [N, in_channels]
        cls_score = self.fc_cls(x)
        # [N, num_classes]
        return cls_score

4：导入head：

paddlevideo/modeling/heads/__init__.py

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .adds_head import AddsHead
from .asrf_head import ASRFHead
from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead
from .base import BaseHead
from .bbox_head import BBoxHeadAVA
from .cfbi_head import CollaborativeEnsemblerMS
from .i3d_head import I3DHead
from .movinet_head import MoViNetHead
from .ms_tcn_head import MSTCNHead
from .pptimesformer_head import ppTimeSformerHead
from .pptsm_head import ppTSMHead
from .pptsn_head import ppTSNHead
from .roi_head import AVARoIHead
from .single_straight3d import SingleRoIExtractor3D
from .slowfast_head import SlowFastHead
from .stgcn_head import STGCNHead
from .timesformer_head import TimeSformerHead
from .transnetv2_head import TransNetV2Head
from .tsm_head import TSMHead
from .tsn_head import TSNHead
from .ms_tcn_head import MSTCNHead
from .asrf_head import ASRFHead
from .ctrgcn_head import CTRGCNHead
from .movinet_head import MoViNetHead
from .agcn2s_head import AGCN2sHead
from .token_shift_head import TokenShiftHead
from .i2d_head import I2DHead

# Public heads exported by this package.
__all__ = [
    'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',
    'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',
    'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',
    'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',
    'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead',
    'AGCN2sHead', 'I2DHead'
]

5：训练配置文件：

configs/recognition/pptsm/v2/md_ppsqt_16frames_uniform.yaml

MODEL: #MODEL field
    framework: "Recognizer2D" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' .
    backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' .
        name: "SqueezeTime" #Mandatory, The name of backbone.
    head:
        name: "I2DHead" #Mandatory, indicate the type of head, associate to the 'paddlevideo/modeling/heads'
        #pretrained: "" #Optional, pretrained model path.
        num_classes: 2
        in_channels: 1024

DATASET: #DATASET field
    batch_size: 16  #Mandatory, batch size
    num_workers: 4 #Mandatory, the number of subprocess on each GPU.
    train:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, train data root path
        file_path: "/home/mnt/sdd/Data/data_fights/train_list.txt" #Mandatory, train data index file path
        suffix: 'img_{:06}.jpg'
    valid:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, valid data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, valid data index file path
        suffix: 'img_{:06}.jpg'
    test:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, test data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, test data index file path
        suffix: 'img_{:06}.jpg'

PIPELINE: #PIPELINE field
    train: #Mandatory, indicate the pipeline to deal with the training data, associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: False
        transform: #Mandatory, image transform operator
            - Scale:
                short_size: 256
            - MultiScaleCrop:
                target_size: 256
            - RandomCrop:
                target_size: 224
            - RandomFlip:
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    valid: #Mandatory, indicate the pipeline to deal with the validation data. associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    test:  #Mandatory, indicate the pipeline to deal with the test data. associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]

OPTIMIZER: #OPTIMIZER field
    name: 'Momentum'
    momentum: 0.9
    learning_rate:
        iter_step: True
        name: 'CustomWarmupCosineDecay'
        max_epoch: 120
        warmup_epochs: 10
        warmup_start_lr: 0.005
        cosine_base_lr: 0.01
    weight_decay:
        name: 'L2'
        value: 1e-4
    use_nesterov: True

MIX:
    name: "Mixup"
    alpha: 0.2

METRIC:
    name: 'CenterCropMetric'

INFERENCE:
    name: 'ppSQT_Inference_helper'
    num_seg: 16
    target_size: 224

model_name: "ppSQT"
log_interval: 10 #Optional, the interal of logger, default:10
epochs: 120  #Mandatory, total epoch
log_level: "INFO" #Optional, the logger level. default: "INFO"

6：训练：

# multi-gpu-st
export CUDA_VISIBLE_DEVICES=0,1
python -B -m paddle.distributed.launch --gpus="0,1"  --log_dir=./log/log_sqt_frame_16  main.py  --validate -c configs/recognition/pptsm/v2/ppsqt_lcnet_md_16frames_uniform.yaml

7：结果：精度比ppTSM-v2低8个点左右。有可能是没有预训练权重的问题。