1 模型总览
宏观上来说,DETR主要包含三部分:以卷积神经网络为主的骨干网(CNN Backbone)、以TRM(Transformer)为主的特征抽取及交互器,以及以FFN为主的分类和回归头,如DETR中build()函数所示。DETR最出彩的地方在于,它摒弃了NMS、anchor generation等非端到端的处理步骤,以集合预测的方式端到端地建模目标检测过程,并且将Transformer引入目标检测,打开了新领域的大门。
def build(args):
    """Assemble the DETR model, its training criterion and the post-processors.

    Args:
        args: Configuration namespace. This function reads ``aux_loss``,
            ``bbox_loss_coef``, ``giou_loss_coef`` and ``eos_coef`` directly;
            the ``build_*`` helpers receive the whole namespace.

    Returns:
        tuple: ``(model, criterion, postprocessors)``.
    """
    num_classes = 81  # class count hard-coded by this walkthrough

    backbone = build_backbone(args)
    transformer = build_transformer(args)
    model = DETR(
        backbone,  # CNN backbone
        transformer,  # the key component
        num_classes=num_classes,
        num_queries=100,  # number of object queries; they play the role of spatial embeddings
        aux_loss=args.aux_loss,
    )

    matcher = build_matcher(args)  # bipartite matching
    weight_dict = {
        'loss_ce': 1,
        'loss_bbox': args.bbox_loss_coef,
        'loss_giou': args.giou_loss_coef,
    }
    losses = ['labels', 'boxes', 'cardinality']

    # The criterion is what consumes matcher, weight_dict and losses; without
    # this assignment the return statement below raises NameError.
    # NOTE(review): SetCriterion's signature and args.eos_coef follow the
    # upstream DETR repository -- confirm both are available in this project.
    criterion = SetCriterion(
        num_classes,
        matcher=matcher,
        weight_dict=weight_dict,
        eos_coef=args.eos_coef,
        losses=losses,
    )

    postprocessors = {'bbox': PostProcess()}
    return model, criterion, postprocessors
2 DETR的基本流程
- CNN backbone 提取图像的 feature
- Transformer Encoder 通过 self-attention 建模全局关系对 feature 进行增强
- Transformer Decoder 的输入是 object queries(spatial embedding)和 Transformer encoder 的输出(content embedding),主要包含 self-attention 和 cross-attention 两个过程。Self-attention 主要是让各个 query 之间做交互,使每个 query 能看到其他 query 在查询什么,从而避免重复预测,作用类似于 NMS;Cross-attention 则将 object query 当做查询(query),将 encoder feature 当做 key 和 value,用来查询与该 query 相关的区域。
- 对 Decoder 输出的查询好了的 query,使用 FFN 提取出目标框的位置和类别信息
顺着上边的基本流程,从代码入手一点点理解原文的思想,下面开始!
3 backbone
首先是构建backbone模块的函数
def build_backbone(args):
    """Build the backbone joined with its positional-encoding module.

    Args:
        args: Configuration namespace. Reads ``backbone``, ``dilation``,
            ``lr_backbone`` and ``masks`` here; ``build_position_encoding``
            receives the whole namespace.

    Returns:
        A ``Joiner`` wrapping the backbone and the position embedding, with
        ``num_channels`` copied from the backbone.
    """
    position_embedding = build_position_encoding(args)

    # These two flags were used below without ever being defined (NameError).
    # NOTE(review): derivation follows the upstream DETR repository (train the
    # backbone only when it has a positive learning rate; return intermediate
    # layers only for the segmentation/mask setup) -- confirm against this
    # project's argument parser.
    train_backbone = args.lr_backbone > 0
    return_interm_layers = args.masks

    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
3.1 根据特征图生成位置编码
def build_position_encoding(args):
    """Create the positional-encoding module selected by ``args.position_embedding``.

    Args:
        args: Configuration namespace providing ``hidden_dim`` and
            ``position_embedding``.

    Returns:
        ``PositionEmbeddingSine`` for ``'v2'``/``'sine'`` or
        ``PositionEmbeddingLearned`` for ``'v3'``/``'learned'``.

    Raises:
        ValueError: If ``args.position_embedding`` is none of the above.
    """
    # Only half of the hidden dimension is handed to the encoding module.
    half_dim = args.hidden_dim // 2
    choice = args.position_embedding

    if choice in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        return PositionEmbeddingSine(half_dim, normalize=True)
    if choice in ('v3', 'learned'):
        return PositionEmbeddingLearned(half_dim)
    raise ValueError(f"not supported {args.position_embedding}")
对于输入特征x,假定其尺寸为BxCxHxW,位置编码需要同时覆盖H、W两个维度,所以一般会将hidden_dim(C,通道)切分为两部分,一部分编码H方向,另一部分编码W方向,最后在通道维度上进行拼接。DETR中的位置编码主要有sine和learned position embedding两种。我自己仿照TRM写了一个可学习位置编码的实现,可以运行试试:
# Sanity check of the learnable positional-embedding mechanism.
x = torch.randn((8, 3, 32, 32))
h, w = x.shape[-2:]

# One embedding table per spatial axis (50 positions, 256 channels each).
row_embed = nn.Embedding(50, 256)
col_embed = nn.Embedding(50, 256)

col_index = torch.arange(w, device=x.device)
row_index = torch.arange(h, device=x.device)
x_emb = col_embed(col_index)  # (w, 256)
y_emb = row_embed(row_index)  # (h, 256)

# Tile each axis embedding over the other axis so both become (h, w, 256).
x_cat = x_emb.unsqueeze(0).repeat(h, 1, 1)
y_cat = y_emb.unsqueeze(1).repeat(1, w, 1)

# Concatenate along the channel axis -> (h, w, 512).
pos = torch.cat([x_cat, y_cat], dim=-1)

# Move channels first and copy once per batch element.
channels_first = pos.permute(2, 0, 1).unsqueeze(0)
pos_learn = channels_first.repeat(x.shape[0], 1, 1, 1)  # shape: (8, 512, 32, 32)
构建backbone
这部分不太难,相关注释已经写在代码块中
class BackboneBase(nn.Module):
    """Wrap a backbone so it returns selected layer outputs with matching masks.

    Args:
        backbone: Network whose parameters and named layers are used.
        train_backbone: When False, every backbone parameter is frozen.
        num_channels: Stored on the instance as ``num_channels``.
        return_interm_layers: When True, ``layer1``..``layer4`` are returned;
            otherwise only ``layer4``.
    """

    def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
        super().__init__()

        # A parameter stays trainable only if training is enabled AND its name
        # mentions layer2, layer3 or layer4; everything else is frozen.
        trainable_stages = ('layer2', 'layer3', 'layer4')
        for param_name, param in backbone.named_parameters():
            stays_trainable = train_backbone and any(stage in param_name for stage in trainable_stages)
            if not stays_trainable:
                param.requires_grad_(False)

        if return_interm_layers:
            # Return the intermediate layers too; used for multi-scale fusion.
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        else:
            # Usual case: only the output of the last layer is returned.
            return_layers = {'layer4': "0"}

        # IntermediateLayerGetter chains the backbone's layers much like
        # Sequential and lets us pick which intermediate outputs to return.
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        """Run the backbone and pair each returned feature map with a resized mask.

        Returns:
            Dict mapping each output name of ``self.body`` to a
            ``NestedTensor`` of (feature map, boolean mask).
        """
        feature_maps = self.body(tensor_list.tensors)

        out: Dict[str, NestedTensor] = {}
        for layer_key, feature in feature_maps.items():
            input_mask = tensor_list.mask
            assert input_mask is not None
            # Interpolate the mask to this feature map's spatial size, then
            # convert it back to bool and drop the temporary leading axis.
            resized = F.interpolate(input_mask[None].float(), size=feature.shape[-2:])
            out[layer_key] = NestedTensor(feature, resized.to(torch.bool)[0])
        return out
将上述二者对应组合
class Joiner(nn.Sequential):
    """Pair a backbone (index 0) with a position-embedding module (index 1)."""

    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        """Return the backbone features and one positional encoding per feature.

        Returns:
            tuple: ``(features, pos)`` -- two lists of equal length, in the
            iteration order of the backbone's output mapping.
        """
        backbone, position_embedding = self[0], self[1]

        features: List[NestedTensor] = list(backbone(tensor_list).values())
        # Each encoding is cast to the dtype of the feature it belongs to.
        pos = [position_embedding(feature).to(feature.tensors.dtype) for feature in features]
        return features, pos
Transformer
TRM部分其实跟Attention is all you need的模型结构完全相同,不同之处只在于decoder的输入。在原始transformer中,decoder的输入是目标序列融合位置编码后的embedding;而在本文中,则使用初始化为全0的tgt作为目标序列,然后再融合query_embed。这里非常容易混淆的一点是:全零的tgt序列才是content embedding,代码中的query_embed则是代表目标框集合位置的spatial embedding。
由于encoder部分主要是特征提取,对边界定位影响不大,所以我们考虑decoder的cross-attention部分,其输入主要包括三部分:query key value
- queries:每个 query 都是 decoder 第一层 self-attention 的输出( content query )+ object query( spatial query ),这里的 object query 就是 DETR 中提出的概念,每个 object query 都是候选框的信息,经过 FFN 后能输出位置和类别信息(本文 object query 个数 N 为 100)
- keys:每个 key 都是 encoder 的输出特征( content key ) + 位置编码( spatial key )构成
- values:只有来自 encoder 的输出
class Transformer(nn.Module):
    """Encoder-decoder transformer operating on flattened feature maps.

    The constructor arguments are forwarded to the encoder/decoder layer
    classes; ``d_model`` and ``nhead`` are also stored on the instance.
    """

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        # Encoder: the final LayerNorm is only used in the pre-norm setup.
        enc_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        enc_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(enc_layer, num_encoder_layers, enc_norm)

        # Decoder: always followed by a LayerNorm.
        dec_layer = TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        dec_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            dec_layer, num_decoder_layers, dec_norm,
            return_intermediate=return_intermediate_dec,
        )

        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src, mask, query_embed, pos_embed):
        """Encode the feature map and decode the object queries against it.

        Args:
            src: Feature map; unpacked here as (N, C, H, W).
            mask: Mask flattened here from (N, H, W) to (N, HW).
            query_embed: Query embeddings; a batch axis is inserted here,
                (num_queries, hidden_dim) -> (num_queries, N, hidden_dim).
            pos_embed: Positional encoding, flattened the same way as ``src``.

        Returns:
            tuple: the decoder output with axes 1 and 2 swapped, and the
            encoder memory reshaped back to (N, C, H, W).
        """
        bs, c, h, w = src.shape

        # flatten(2) merges every axis from index 2 onward:
        # (N, C, H, W) -> (N, C, HW) -> (HW, N, C).
        seq_src = src.flatten(2).permute(2, 0, 1)
        seq_pos = pos_embed.flatten(2).permute(2, 0, 1)
        # Repeat the queries once per batch element.
        queries = query_embed.unsqueeze(1).repeat(1, bs, 1)
        flat_mask = mask.flatten(1)

        # The decoder's content input starts as all zeros, shaped like the queries.
        tgt = torch.zeros_like(queries)

        # Encoder output; it is reshaped back to the feature-map layout below,
        # i.e. it has the (HW, N, C) layout of seq_src.
        memory = self.encoder(seq_src, src_key_padding_mask=flat_mask, pos=seq_pos)
        # Decoder: the queries interact with the encoder memory.
        decoded = self.decoder(
            tgt, memory,
            memory_key_padding_mask=flat_mask,
            pos=seq_pos,
            query_pos=queries,
        )

        memory_map = memory.permute(1, 2, 0).view(bs, c, h, w)
        return decoded.transpose(1, 2), memory_map
DETR
class DETR(nn.Module):
    """ This is the DETR module that performs object detection """

    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """Wire the backbone, the transformer and the prediction heads together.

        Args:
            backbone: Module returning ``(features, pos)``; must expose
                ``num_channels``.
            transformer: Module exposing ``d_model``; called in ``forward``.
            num_classes: Number of object classes (one extra logit is added).
            num_queries: Number of object queries.
            aux_loss: When True, ``forward`` also returns ``'aux_outputs'``.
        """
        super().__init__()
        self.num_queries = num_queries  # number of object queries
        self.transformer = transformer
        hidden_dim = transformer.d_model  # hidden size
        # Classification head: num_classes plus one background class.
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        # Box head: maps to 4 box values (xywh).
        # NOTE(review): presumably the trailing 3 is the MLP layer count -- confirm.
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)  # object queries
        # 1x1 conv projecting the backbone channels down to hidden_dim.
        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss

    def forward(self, samples: NestedTensor):
        """Run detection on a batch.

        Args:
            samples: A ``NestedTensor``; a list or plain tensor is converted
                with ``nested_tensor_from_tensor_list`` first.

        Returns:
            dict with ``'pred_logits'`` and ``'pred_boxes'`` (the last entry
            of the head outputs) and, when ``aux_loss`` is set,
            ``'aux_outputs'`` holding the remaining entries.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        # The backbone may return several levels; only the last one feeds the
        # transformer.
        src, mask = features[-1].decompose()
        assert mask is not None

        # Index 0 of the transformer's return value is the decoder output
        # (index 1 is the encoder output).
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out

    def _set_aux_loss(self, outputs_class, outputs_coord):
        """Pair up every head output except the last one for the auxiliary losses.

        forward() calls this whenever ``aux_loss`` is enabled; the last entry
        is skipped because forward() already reports it as the main prediction.

        Returns:
            list of ``{'pred_logits': ..., 'pred_boxes': ...}`` dicts.
        """
        return [
            {'pred_logits': logits, 'pred_boxes': boxes}
            for logits, boxes in zip(outputs_class[:-1], outputs_coord[:-1])
        ]