本篇文章将介绍一个新的改进机制——HTB,并阐述如何将其应用于YOLOv11中,显著提升模型性能。在现代计算机视觉任务中,尤其是在目标检测领域,YOLO系列模型因其快速和准确的检测性能而备受关注。随着YOLOv11的提出,我们迎来了更强大的特征提取能力和更高效的检测架构。然而,如何在复杂的天气条件下(如雨、雾、雪)保持高质量的检测结果仍然是一个挑战。为了解决这一问题,我们尝试将Histogram Transformer Block(HTB)与YOLOv11结合,以提高模型在天气退化图像上的鲁棒性和检测性能。
1. Histogram Transformer Block(HTB)结构介绍
Histogram Transformer Block(HTB)是一个专门为解决恶劣天气图像退化问题设计的模块,其核心思想是利用动态范围直方图自注意力机制来对图像中的退化区域进行有效处理。HTB是Histoformer模型的关键组成部分,主要包括两个模块:动态范围直方图自注意力(DHSA)和双尺度门控前馈网络(DGFF),它们共同用于从受天气影响的图像中提取特征。
-
动态范围直方图自注意力(DHSA):
- 直方图分类:根据像素的强度值将其划分到不同的直方图箱(bin)中,并在这些基于强度的箱内和箱间分别应用自注意力机制。这种做法增强了注意力机制,使其能够集中关注具有相似退化模式的区域(例如,雨、雾或雪造成的影响)。
- 动态范围卷积:在应用注意力之前,通过对特征进行垂直和水平排序来重新组织空间特征。这种重组方式使卷积操作能够更好地关注动态分布的退化模式,而不是固定的邻近像素。
-
双尺度门控前馈网络(DGFF):
- DGFF模块集成了两条独立的多尺度和多范围深度卷积路径,以增强特征提取能力。在特征转换过程中,使用5×5和膨胀3×3的深度卷积来提取多范围和多尺度信息。通过门控机制,将第二个分支的输出作为另一个分支的门控图,从而实现特征融合。
2. YOLOv11与HTB的结合
1. backbone引入(DHSA):我们在YOLOv11主干网络的C2PSA模块中插入DHSA模块,以代替PSABlock中传统的attention,使其能够集中关注具有相似退化模式的区域(例如,雨、雾或雪造成的影响)。
2. backbone引入(HTB):我们在YOLOv11主干网络的C2PSA模块中插入HTB模块,以代替传统的PSABlock。通过结合动态范围卷积和双分支直方图自注意力,HTB能够有效地建模长距离空间特征,使得模型在处理天气退化图像时表现出更好的性能。
3. head引入:在YOLOv11的预测头部分,我们引入HTB,使得预测头能够更有效地聚焦于退化区域中的重要目标,提升检测精度。
3. Histogram Transformer Block(HTB)代码部分
import numbers
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

# Relative import: this file has to live in the same package as ``block.py``.
from .block import PSABlock, C2PSA

# Short alias used by the modules below whenever they build a convolution.
Conv2d = nn.Conv2d

## Layer Norm
def to_2d(x):
    """Flatten a ``b c h w`` tensor into ``b (h w c)``."""
    return rearrange(x, 'b c h w -> b (h w c)')


def to_3d(x):
    """Turn a ``b c h w`` feature map into a ``b (h w) c`` token sequence."""
    return rearrange(x, 'b c h w -> b (h w) c')


def to_4d(x, h, w):
    """Inverse of :func:`to_3d`: fold ``b (h w) c`` tokens back to ``b c h w``."""
    return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)


def _single_axis_size(normalized_shape):
    """Coerce *normalized_shape* to a one-element ``torch.Size``.

    An integer is wrapped in a tuple first; anything that does not end up
    with exactly one entry trips the assertion.
    """
    if isinstance(normalized_shape, numbers.Integral):
        normalized_shape = (normalized_shape,)
    size = torch.Size(normalized_shape)
    assert len(size) == 1
    return size


class BiasFree_LayerNorm(nn.Module):
    """Scale the last axis by its standard deviation (no mean subtraction).

    The module holds no learnable parameters; the division by
    ``sqrt(var + 1e-5)`` is the whole operation.
    """

    def __init__(self, normalized_shape):
        super().__init__()
        self.normalized_shape = _single_axis_size(normalized_shape)

    def forward(self, x):
        # Biased (population) variance over the last axis.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        return x / torch.sqrt(variance + 1e-5)


class WithBias_LayerNorm(nn.Module):
    """Standardise the last axis to zero mean and unit variance.

    Despite the name, no learnable weight or bias is applied.
    """

    def __init__(self, normalized_shape):
        super().__init__()
        self.normalized_shape = _single_axis_size(normalized_shape)

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        return (x - mean) / torch.sqrt(variance + 1e-5)


class LayerNorm(nn.Module):
    """Channel-wise layer norm for ``b c h w`` feature maps.

    Args:
        dim: Number of channels that is normalised over.
        LayerNorm_type: ``'BiasFree'`` selects :class:`BiasFree_LayerNorm`;
            every other value selects :class:`WithBias_LayerNorm`.
    """

    def __init__(self, dim, LayerNorm_type="WithBias"):
        super().__init__()
        norm_cls = BiasFree_LayerNorm if LayerNorm_type == 'BiasFree' else WithBias_LayerNorm
        self.body = norm_cls(dim)

    def forward(self, x):
        height, width = x.shape[-2], x.shape[-1]
        # Move channels to the last axis so the inner norm acts per pixel.
        tokens = to_3d(x)
        return to_4d(self.body(tokens), height, width)
##########################################################################
## Dual-scale Gated Feed-Forward Network (DGFF)
class FeedForward(nn.Module):
    """Dual-scale Gated Feed-Forward network (DGFF).

    The input is expanded with a 1x1 convolution, pixel-shuffled, split into
    two halves along the channel axis and sent through a 5x5 depth-wise
    convolution and a dilated 3x3 depth-wise convolution. The Mish-activated
    dilated branch gates the 5x5 branch; the result is pixel-unshuffled and
    projected back to ``dim`` channels.

    Args:
        dim: Number of input and output channels.
        ffn_expansion_factor: Multiplier giving the hidden width,
            ``int(dim * ffn_expansion_factor)``.
        bias: Whether the convolutions carry a bias term.
    """

    def __init__(self, dim, ffn_expansion_factor, bias):
        super().__init__()

        hidden_features = int(dim * ffn_expansion_factor)
        # Width of each gated branch once PixelShuffle(2) and the 2-way chunk
        # have been applied to the ``2 * hidden_features`` expanded channels.
        branch_channels = hidden_features // 4

        self.project_in = Conv2d(dim, hidden_features * 2, kernel_size=1, bias=bias)
        self.dwconv_5 = Conv2d(
            branch_channels,
            branch_channels,
            kernel_size=5,
            stride=1,
            padding=2,
            groups=branch_channels,
            bias=bias,
        )
        self.dwconv_dilated2_1 = Conv2d(
            branch_channels,
            branch_channels,
            kernel_size=3,
            stride=1,
            padding=2,
            groups=branch_channels,
            bias=bias,
            dilation=2,
        )
        self.p_unshuffle = nn.PixelUnshuffle(2)
        self.p_shuffle = nn.PixelShuffle(2)
        self.project_out = Conv2d(hidden_features, dim, kernel_size=1, bias=bias)

    def forward(self, x):
        expanded = self.p_shuffle(self.project_in(x))
        local_part, dilated_part = expanded.chunk(2, dim=1)

        local_feat = self.dwconv_5(local_part)
        gate = F.mish(self.dwconv_dilated2_1(dilated_part))

        fused = self.p_unshuffle(gate * local_feat)
        return self.project_out(fused)


##########################################################################
##Dynamic-range Histogram Self-Attention (DHSA)
class Attention_histogram(nn.Module):
    """Dynamic-range Histogram Self-Attention (DHSA).

    The first half of the channels is sorted along H and then W before the
    q/k/v projection. Queries and keys are then re-ordered by the sorted
    order of the values, attention is computed under two different groupings
    of the sorted positions, and the two results are multiplied. All
    re-orderings are undone before the output is returned.

    Args:
        dim: Number of input/output channels.
        num_heads: Number of attention heads. The same value is stored as
            ``self.factor`` and used as the padding multiple and as the size
            of the ``factor`` axis in :meth:`reshape_attn`.
        bias: Whether the convolutions carry a bias term.
        ifBox: Stored on the instance; ``forward`` does not read it and
            instead calls :meth:`reshape_attn` once with ``True`` and once
            with ``False``.
    """

    def __init__(self, dim, num_heads=4, bias=False, ifBox=True):
        super(Attention_histogram, self).__init__()
        self.factor = num_heads
        self.ifBox = ifBox
        self.num_heads = num_heads
        # One learnable scale per head, broadcast over the attention map.
        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
        # 5 * dim output channels: q1, k1, q2, k2 and v (see forward()).
        self.qkv = Conv2d(dim, dim * 5, kernel_size=1, bias=bias)
        self.qkv_dwconv = Conv2d(dim * 5, dim * 5, kernel_size=3, stride=1, padding=1, groups=dim * 5, bias=bias)
        self.project_out = Conv2d(dim, dim, kernel_size=1, bias=bias)

    def pad(self, x, factor):
        """Zero-pad the end of the last axis up to the next multiple of *factor*.

        Returns:
            The padded tensor and the ``[left, right]`` padding that was used.
        """
        hw = x.shape[-1]
        t_pad = [0, 0] if hw % factor == 0 else [0, (hw // factor + 1) * factor - hw]
        x = F.pad(x, t_pad, 'constant', 0)
        return x, t_pad

    def unpad(self, x, t_pad):
        """Strip the padding described by *t_pad* from the last axis of a 3-D tensor."""
        _, _, hw = x.shape
        return x[:, :, t_pad[0]:hw - t_pad[1]]

    def softmax_1(self, x, dim=-1):
        """Softmax variant with an extra ``1`` in the denominator.

        Because of the added constant the outputs along *dim* sum to less
        than one.
        """
        logit = x.exp()
        logit = logit / (logit.sum(dim, keepdim=True) + 1)
        return logit

    def normalize(self, x):
        """Standardise *x* along axis ``-2``. Not called anywhere in this class."""
        mu = x.mean(-2, keepdim=True)
        sigma = x.var(-2, keepdim=True, unbiased=False)
        return (x - mu) / torch.sqrt(sigma + 1e-5)  # * self.weight + self.bias

    def reshape_attn(self, q, k, v, ifBox):
        """Run multi-head attention over one grouping of the flattened positions.

        Args:
            q, k, v: Tensors whose first two axes are batch and channels and
                whose last axis is the flattened spatial axis.
            ifBox: ``True`` splits the last axis as ``(factor hw)``,
                ``False`` as ``(hw factor)``. NOTE(review): these presumably
                correspond to the bin-wise and frequency-wise histogram
                reshapings of the Histoformer paper - confirm.

        Returns:
            Attention output with the same layout as the (unpadded) inputs.
        """
        b, c = q.shape[:2]
        # All three tensors share the same last-axis length, so the three
        # t_pad values are identical; the last one is reused for unpad().
        q, t_pad = self.pad(q, self.factor)
        k, t_pad = self.pad(k, self.factor)
        v, t_pad = self.pad(v, self.factor)
        hw = q.shape[-1] // self.factor
        shape_ori = "b (head c) (factor hw)" if ifBox else "b (head c) (hw factor)"
        shape_tar = "b head (c factor) hw"
        q = rearrange(q, '{} -> {}'.format(shape_ori, shape_tar), factor=self.factor, hw=hw, head=self.num_heads)
        k = rearrange(k, '{} -> {}'.format(shape_ori, shape_tar), factor=self.factor, hw=hw, head=self.num_heads)
        v = rearrange(v, '{} -> {}'.format(shape_ori, shape_tar), factor=self.factor, hw=hw, head=self.num_heads)
        # L2-normalise q and k so their dot products are cosine similarities.
        q = torch.nn.functional.normalize(q, dim=-1)
        k = torch.nn.functional.normalize(k, dim=-1)
        attn = (q @ k.transpose(-2, -1)) * self.temperature
        attn = self.softmax_1(attn, dim=-1)
        out = (attn @ v)
        out = rearrange(out, '{} -> {}'.format(shape_tar, shape_ori), factor=self.factor, hw=hw, b=b,head=self.num_heads)
        out = self.unpad(out, t_pad)
        return out

    def forward(self, x):
        """Apply DHSA to a ``(b, c, h, w)`` feature map and return the same shape."""
        b, c, h, w = x.shape
        # Sort the first half of the channels along H, then along W, keeping
        # the indices so the permutation can be undone at the end.
        x_sort, idx_h = x[:, :c // 2].sort(-2)
        x_sort, idx_w = x_sort.sort(-1)
        # Clone before the in-place write so the caller's tensor is untouched.
        x = x.clone()
        x[:, :c // 2] = x_sort
        qkv = self.qkv_dwconv(self.qkv(x))
        q1, k1, q2, k2, v = qkv.chunk(5, dim=1) # b,c,x,x
        # Sort v over the flattened spatial axis and bring q/k into the same
        # order, so equal positions refer to the same pixel.
        v, idx = v.view(b, c, -1).sort(dim=-1)
        q1 = torch.gather(q1.view(b, c, -1), dim=2, index=idx)
        k1 = torch.gather(k1.view(b, c, -1), dim=2, index=idx)
        q2 = torch.gather(q2.view(b, c, -1), dim=2, index=idx)
        k2 = torch.gather(k2.view(b, c, -1), dim=2, index=idx)
        out1 = self.reshape_attn(q1, k1, v, True)
        out2 = self.reshape_attn(q2, k2, v, False)
        # Scatter with the sort indices: inverse of the value-order permutation.
        out1 = torch.scatter(out1, 2, idx, out1).view(b, c, h, w)
        out2 = torch.scatter(out2, 2, idx, out2).view(b, c, h, w)
        out = out1 * out2
        out = self.project_out(out)
        # Undo the W sort and then the H sort on the first half of the
        # channels (reverse order of how they were applied).
        out_replace = out[:, :c // 2]
        out_replace = torch.scatter(out_replace, -1, idx_w, out_replace)
        out_replace = torch.scatter(out_replace, -2, idx_h, out_replace)
        out[:, :c // 2] = out_replace
        return out


## Histogram Transformer Block (HTB)
class TransformerBlock(nn.Module):
    """Histogram Transformer Block (HTB).

    A pre-norm residual block: histogram self-attention followed by the
    dual-scale gated feed-forward network, each wrapped in a skip connection.

    Args:
        dim: Number of input/output channels.
        num_heads: Heads used by the histogram attention.
        ffn_expansion_factor: Hidden-width multiplier of the feed-forward part.
        bias: Whether the convolutions carry a bias term.
        LayerNorm_type: ``'WithBias'`` (default) or ``'BiasFree'``.
    """

    def __init__(
        self,
        dim,
        num_heads=4,
        ffn_expansion_factor=2.5,
        bias=False,
        LayerNorm_type='WithBias',
    ):
        super().__init__()
        self.attn_g = Attention_histogram(dim, num_heads=num_heads, bias=bias, ifBox=True)
        self.norm_g = LayerNorm(dim, LayerNorm_type=LayerNorm_type)
        self.ffn = FeedForward(dim, ffn_expansion_factor=ffn_expansion_factor, bias=bias)
        self.norm_ff1 = LayerNorm(dim, LayerNorm_type=LayerNorm_type)

    def forward(self, x):
        # Attention sub-layer with residual connection.
        attended = self.attn_g(self.norm_g(x))
        x = x + attended
        # Feed-forward sub-layer with residual connection.
        refined = self.ffn(self.norm_ff1(x))
        return x + refined


# Replace the attention inside PSABlock with the Dynamic-range Histogram
# Self-Attention (DHSA) taken from HTB.
class PSABlock_DHSA(PSABlock):
    """PSABlock whose attention layer is replaced by :class:`Attention_histogram`.

    Args:
        c: Number of channels.
        qk_dim: Accepted for call-site compatibility; not used.
        pdim: Accepted for call-site compatibility; not used.
        shortcut: Forwarded to ``PSABlock``. Previously this argument was
            accepted but silently dropped, so the parent always used its
            own default.
    """

    def __init__(self, c, qk_dim=16, pdim=32, shortcut=True) -> None:
        super().__init__(c, shortcut=shortcut)
        # Swap the attention built by the parent for the histogram attention.
        self.attn = Attention_histogram(c)


class C2PSA_DHSA(C2PSA):
    """C2PSA variant that stacks :class:`PSABlock_DHSA` blocks.

    Args:
        c1: Input channels.
        c2: Output channels; must equal ``c1``.
        n: Number of ``PSABlock_DHSA`` blocks in ``self.m``.
        e: Expansion ratio; the hidden width is ``int(c1 * e)``.
    """

    def __init__(self, c1, c2, n=1, e=0.5):
        # Forward ``e`` so that whatever the parent sizes from the expansion
        # ratio agrees with ``self.c`` below. Before, the parent always ran
        # with its default ratio, which only matches when e == 0.5.
        # NOTE(review): assumes the parent signature is (c1, c2, n=1, e=0.5),
        # as in ultralytics' C2PSA - confirm for the installed version.
        super().__init__(c1, c2, e=e)
        assert c1 == c2
        self.c = int(c1 * e)
        # Replace the parent's block stack with n histogram-attention blocks.
        self.m = nn.Sequential(*(PSABlock_DHSA(self.c, qk_dim=16, pdim=32) for _ in range(n)))


# Replace PSABlock with the Histogram Transformer Block (HTB).
class C2PSA_THB(C2PSA):
    """C2PSA variant whose block stack is made of :class:`TransformerBlock` (HTB).

    Args:
        c1: Input channels.
        c2: Output channels; must equal ``c1``.
        n: Number of ``TransformerBlock`` blocks in ``self.m``.
        e: Expansion ratio; the hidden width is ``int(c1 * e)``.
    """

    def __init__(self, c1, c2, n=1, e=0.5):
        # Forward ``e`` so that whatever the parent sizes from the expansion
        # ratio agrees with ``self.c`` below. Before, the parent always ran
        # with its default ratio, which only matches when e == 0.5.
        # NOTE(review): assumes the parent signature is (c1, c2, n=1, e=0.5),
        # as in ultralytics' C2PSA - confirm for the installed version.
        super().__init__(c1, c2, e=e)
        assert c1 == c2
        self.c = int(c1 * e)
        self.m = nn.Sequential(*(TransformerBlock(self.c) for _ in range(n)))


# The model YAML that accompanies this code refers to the module as
# ``C2PSA_HTB``; expose that spelling too so both names resolve.
C2PSA_HTB = C2PSA_THB


if __name__ == '__main__':
    # Smoke test: the block must preserve the input shape.
    # NOTE: this file uses a relative import (``from .block import ...``), so
    # it has to be executed as a module of its package, not as a bare script.
    TB = TransformerBlock(256)

    # Create an input tensor.
    batch_size = 8
    input_tensor = torch.randn(batch_size, 256, 64, 64)

    # Run the model and print the input and output shapes.
    output_tensor = TB(input_tensor)
    print("Input shape:", input_tensor.shape)
    print("Output shape:", output_tensor.shape)
4. 将HTB引入到YOLOv11中
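第一: 将上面第3节的核心代码保存为一个.py文件,并复制到D:\bilibili\model\YOLO11\ultralytics-main\ultralytics\nn路径下,如下图所示。(注意:代码中使用了相对导入 from .block import PSABlock, C2PSA,因此该文件必须与 block.py 位于同一个包目录中;若放在其他目录,需要相应修改这条导入语句。)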
第二:在tasks.py中导入HTB相关的模块(例如 TransformerBlock、C2PSA_DHSA、C2PSA_THB)
第三:在tasks.py中解析模型配置的部分(parse_model函数)添加下面的代码
第一第二改进修改代码的部分
第三改进修改代码的部分
elif m is TransformerBlock :args = [ch[f]]
第四:将模型配置文件复制到YOLOv11的YAML文件中。下面依次给出三种改进对应的配置:backbone中使用C2PSA_DHSA、backbone中使用HTB版本的C2PSA、以及在head中插入TransformerBlock。
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# Variant 1: backbone layer 10 uses C2PSA_DHSA.

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summary figures below look carried over from the stock
  # config - re-measure them for this variant.
  n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
  m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
  l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
  x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 2, C3k2, [256, False, 0.25]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 2, C3k2, [512, False, 0.25]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 2, C3k2, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 2, C3k2, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9
  - [-1, 2, C2PSA_DHSA, [1024]] # 10

# YOLO11n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 13

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 13], 1, Concat, [1]] # cat head P4
  - [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 10], 1, Concat, [1]] # cat head P5
  - [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)

  - [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# Variant 2: backbone layer 10 uses the HTB-based C2PSA.

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summary figures below look carried over from the stock
  # config - re-measure them for this variant.
  n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
  m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
  l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
  x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 2, C3k2, [256, False, 0.25]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 2, C3k2, [512, False, 0.25]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 2, C3k2, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 2, C3k2, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9
  # The Python class is spelled C2PSA_THB in the accompanying code; the
  # module name here must match the class name exactly.
  - [-1, 2, C2PSA_THB, [1024]] # 10

# YOLO11n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 13

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 13], 1, Concat, [1]] # cat head P4
  - [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 10], 1, Concat, [1]] # cat head P5
  - [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)

  - [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# Variant 3: a TransformerBlock (HTB) follows each of the three head outputs.

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summary figures below look carried over from the stock
  # config - re-measure them for this variant.
  n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
  m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
  l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
  x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 2, C3k2, [256, False, 0.25]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 2, C3k2, [512, False, 0.25]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 2, C3k2, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 2, C3k2, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9
  - [-1, 2, C2PSA, [1024]] # 10

# YOLO11n head
# TransformerBlock is listed with empty args; its channel argument is
# supplied by the TransformerBlock branch added to the model parser.
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 11
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 13

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 16
  - [-1, 1, TransformerBlock, []] # 17 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 13], 1, Concat, [1]] # cat head P4
  - [-1, 2, C3k2, [512, False]] # 20
  - [-1, 1, TransformerBlock, []] # 21 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 10], 1, Concat, [1]] # cat head P5
  - [-1, 2, C3k2, [1024, True]] # 24
  - [-1, 1, TransformerBlock, []] # 25 (P5/32-large)

  - [[17, 21, 25], 1, Detect, [nc]] # Detect(P3, P4, P5)
第五:运行成功
from ultralytics.models import NAS, RTDETR, SAM, YOLO, FastSAM, YOLOWorld

if __name__ == "__main__":
    # Build the model from the custom YOLOv11 YAML file, transfer the
    # pretrained weights into it, then train.
    model_cfg = r"D:\bilibili\model\YOLO11\ultralytics-main\ultralytics\cfg\models\11\yolo11_HTB.yaml"
    pretrained_weights = r'D:\bilibili\model\YOLO11\ultralytics-main\yolo11n.pt'
    # NOTE(review): unlike the two paths above, this one has no ``YOLO11``
    # directory component - confirm it points at the intended dataset file.
    dataset_cfg = r'D:\bilibili\model\ultralytics-main\ultralytics\cfg\datasets\VOC_my.yaml'

    model = YOLO(model_cfg)  # build from YAML
    model = model.load(pretrained_weights)  # transfer weights
    results = model.train(data=dataset_cfg, epochs=100, imgsz=640, batch=8)