文章目录
- 1.学习目的
- 2.网络模型
![YOLOv5网络模型结构图](https://i-blog.csdnimg.cn/direct/67b8dbd00c9b4034ba370fc8b8a6031a.jpeg)
- 3.common.py分析
1.学习目的
学习YOLOv5中最关键的一个模型类文件common.py。
2.网络模型
3.common.py分析
# Ultralytics YOLOv5 🚀, AGPL-3.0 license
"""Common modules."""import ast
import contextlib
import json
import math
import platform
import warnings
import zipfile
from collections import OrderedDict, namedtuple
from copy import copy
from pathlib import Path
from urllib.parse import urlparseimport cv2
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
from PIL import Image
from torch.cuda import amp# Import 'ultralytics' package or install if missing
try:import ultralyticsassert hasattr(ultralytics, "__version__") # verify package is not directory
except (ImportError, AssertionError):import osos.system("pip install -U ultralytics")import ultralyticsfrom ultralytics.utils.plotting import Annotator, colors, save_one_boxfrom utils import TryExcept
from utils.dataloaders import exif_transpose, letterbox
from utils.general import (LOGGER,ROOT,Profile,check_requirements,check_suffix,check_version,colorstr,increment_path,is_jupyter,make_divisible,non_max_suppression,scale_boxes,xywh2xyxy,xyxy2xywh,yaml_load,
)
from utils.torch_utils import copy_attr, smart_inference_mode# 实现合适的p,使得输出形状和输入一致
def autopad(k, p=None, d=1):
    """Pad to 'same' output shape, adjusting for optional dilation; return the padding size.

    Args:
        k: Kernel size, an int or a sequence of ints.
        p: Explicit padding. Returned unchanged when it is not None.
        d: Dilation. When ``d > 1`` the effective kernel size
            ``d * (k - 1) + 1`` is used in place of ``k``.

    Returns:
        ``p`` if it was given, otherwise half of the (effective) kernel size:
        an int for an int ``k``, a list of ints for a sequence ``k``.
    """
    if d > 1:
        # Dilation spreads the kernel taps, so compute the actual kernel size.
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
    if p is None:
        # No padding supplied: auto-pad with half the kernel size (floor).
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p


# Convolution block, subclass of nn.Module
class Conv(nn.Module):
    """Convolution (no bias) followed by batch normalization and an activation function."""

    # Activation used when ``act=True`` (SiLU).
    default_act = nn.SiLU()

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Build the convolution, batch-norm and activation layers.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size.
            s: Stride.
            p: Padding; when None it is computed by ``autopad(k, p, d)``.
            g: Number of convolution groups.
            d: Dilation.
            act: ``True`` selects ``default_act``; an ``nn.Module`` is used as
                given; any other value disables activation (``nn.Identity``).
        """
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, then batch normalization, then the activation to ``x``."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Apply convolution and activation to ``x``, skipping the batch-norm layer."""
        return self.act(self.conv(x))


# Depth-wise convolution layer
class DWConv(Conv):
    """Depth-wise convolution: a ``Conv`` whose group count is ``gcd(c1, c2)``."""

    def __init__(self, c1, c2, k=1, s=1, d=1, act=True):
        """Initialize the depth-wise convolution.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size.
            s: Stride.
            d: Dilation.
            act: Activation flag or module, forwarded to ``Conv``.
        """
        super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)


# Depth-wise transposed convolution (used for upsampling)
class DWConvTranspose2d(nn.ConvTranspose2d):
    """Depth-wise transposed convolution: ``nn.ConvTranspose2d`` with ``groups=gcd(c1, c2)``."""

    def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0):
        """Initialize the depth-wise transposed convolution.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size.
            s: Stride.
            p1: Input padding (``padding`` of ``nn.ConvTranspose2d``).
            p2: Output padding (``output_padding`` of ``nn.ConvTranspose2d``).
        """
        super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))


# Transformer layer with multi-head attention
class TransformerLayer(nn.Module):
    """Transformer layer with multi-head attention and two linear layers, without LayerNorm."""

    def __init__(self, c, num_heads):
        """Create the query/key/value projections, the attention module and the feed-forward layers.

        LayerNorm is omitted for performance. See https://arxiv.org/abs/2010.11929.

        Args:
            c: Embedding dimension (input and output feature size).
            num_heads: Number of attention heads.
        """
        super().__init__()
        # Bias-free projections producing query, key and value from the same input.
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Run attention and the two linear layers, each wrapped in a residual connection."""
        # [0] selects the attention output (the module also returns attention weights).
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        x = self.fc2(self.fc1(x)) + x
        return x


# Transformer block: optional conv, learnable position embedding, stacked Transformer layers
class TransformerBlock(nn.Module):
    """Transformer block for vision tasks: optional convolution, position embedding, Transformer layers."""

    def __init__(self, c1, c2, num_heads, num_layers):
        """Initialize the block, adding a ``Conv`` only when the channel counts differ.

        Args:
            c1: Input channels.
            c2: Output channels (also the Transformer embedding size).
            num_heads: Attention heads per ``TransformerLayer``.
            num_layers: Number of stacked ``TransformerLayer`` modules.
        """
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
        self.c2 = c2

    def forward(self, x):
        """Flatten the spatial dims into a sequence, run the Transformer layers, and restore the shape."""
        if self.conv is not None:
            x = self.conv(x)
        # The last two dims are only remembered so the output can be reshaped back.
        b, _, w, h = x.shape
        # Merge dims 2 and 3, then move the merged dim first: (b, c, n) -> (n, b, c).
        p = x.flatten(2).permute(2, 0, 1)
        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)


# Bottleneck: core residual building block
class Bottleneck(nn.Module):
    """Bottleneck layer with optional residual shortcut and group convolution."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
        """Initialize a standard bottleneck layer.

        Args:
            c1: Input channels.
            c2: Output channels.
            shortcut: Request a residual connection (only applied when ``c1 == c2``).
            g: Groups for the second (3x3) convolution.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        # The residual add needs matching channel counts.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Apply both convolutions to ``x`` and add ``x`` back when the shortcut is enabled."""
        out = self.cv2(self.cv1(x))
        return x + out if self.add else out


# BottleneckCSP: CSP bottleneck with cross-stage partial connections and optional shortcuts
class BottleneckCSP(nn.Module):
    """CSP bottleneck layer with cross-stage partial connections and optional shortcuts."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the CSP bottleneck.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of repeated ``Bottleneck`` modules.
            shortcut: Shortcut flag forwarded to each ``Bottleneck``.
            g: Groups forwarded to each ``Bottleneck``.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.SiLU()
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Run both branches on ``x``, concatenate on dim 1, then BN, SiLU and the final conv."""
        # Main branch: 1x1 Conv -> n Bottlenecks -> plain 1x1 convolution.
        y1 = self.cv3(self.m(self.cv1(x)))
        # Bypass branch: a single plain 1x1 convolution on the input.
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))


# CrossConv: cross convolution with downsampling, expansion and optional shortcut
class CrossConv(nn.Module):
    """Cross convolution layer: a (1, k) convolution followed by a (k, 1) convolution, with optional shortcut."""

    def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False):
        """Initialize CrossConv.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel length; the two convolutions use kernels (1, k) and (k, 1).
            s: Stride; the two convolutions use strides (1, s) and (s, 1).
            g: Groups for the second convolution.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
            shortcut: Request a residual connection (only applied when ``c1 == c2``).
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, (1, k), (1, s))
        self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Apply both convolutions to ``x`` and add ``x`` back when the shortcut is enabled."""
        out = self.cv2(self.cv1(x))
        return x + out if self.add else out


# C3 module
class C3(nn.Module):
    """CSP bottleneck module built from three ``Conv`` layers and a sequence of ``Bottleneck`` modules."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the C3 module.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of repeated ``Bottleneck`` modules.
            shortcut: Shortcut flag forwarded to each ``Bottleneck``.
            g: Groups forwarded to each ``Bottleneck``.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # optional act=FReLU(c2)
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Concatenate ``m(cv1(x))`` with ``cv2(x)`` on dim 1 and fuse the result with ``cv3``."""
        main = self.m(self.cv1(x))
        bypass = self.cv2(x)
        return self.cv3(torch.cat((main, bypass), 1))


# C3x: C3 module that uses cross-convolutions
class C3x(C3):
    """C3 module whose inner sequence is made of ``CrossConv`` layers."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize C3 as usual, then replace ``self.m`` with ``n`` ``CrossConv`` modules.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of repeated ``CrossConv`` modules.
            shortcut: Shortcut flag forwarded to each ``CrossConv``.
            g: Groups forwarded to each ``CrossConv``.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = nn.Sequential(*(CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)))


# C3TR module
class C3TR(C3):
    """C3 module whose inner stage is a ``TransformerBlock``."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize C3 as usual, then replace ``self.m`` with a ``TransformerBlock``.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of Transformer layers in the block.
            shortcut: Forwarded to the ``C3`` initializer.
            g: Forwarded to the ``C3`` initializer.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        # Fixed at 4 attention heads with n layers.
        self.m = TransformerBlock(c_, c_, 4, n)


# C3SPP: C3 module whose inner stage is an SPP layer
class C3SPP(C3):
    """C3 module whose inner stage is an ``SPP`` layer."""

    def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
        """Initialize C3 as usual, then replace ``self.m`` with an ``SPP`` layer.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Pooling kernel sizes passed to ``SPP``.
            n: Forwarded to the ``C3`` initializer.
            shortcut: Forwarded to the ``C3`` initializer.
            g: Forwarded to the ``C3`` initializer.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = SPP(c_, c_, k)  # SPP is defined later in this module


# C3Ghost: C3 module built from Ghost bottlenecks
class C3Ghost(C3):
    """C3 module whose inner sequence is made of ``GhostBottleneck`` modules."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize C3 as usual, then replace ``self.m`` with ``n`` ``GhostBottleneck`` modules.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of repeated ``GhostBottleneck`` modules.
            shortcut: Forwarded to the ``C3`` initializer.
            g: Forwarded to the ``C3`` initializer.
            e: Expansion ratio; hidden channels are ``int(c2 * e)``.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))


# SPP module
class SPP(nn.Module):
    """Spatial Pyramid Pooling (SPP) layer, ref: https://arxiv.org/abs/1406.4729."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        """Initialize the SPP layer.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel sizes; one stride-1 max-pool with padding ``x // 2`` is created per entry.
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        # The input of cv2 is the un-pooled map plus one pooled map per kernel size.
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        """Reduce channels, pool at every kernel size, concatenate on dim 1 and fuse with ``cv2``."""
        x = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # suppress torch 1.9.0 max_pool2d() warning
            pooled = [m(x) for m in self.m]
            return self.cv2(torch.cat([x] + pooled, 1))


# SPPF: fast spatial pyramid pooling layer
class SPPF(nn.Module):
    """Fast Spatial Pyramid Pooling (SPPF) layer for YOLOv5."""

    def __init__(self, c1, c2, k=5):
        """Initialize the SPPF layer.

        Per the upstream note, the default ``k=5`` is equivalent to ``SPP(k=(5, 9, 13))``.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size of the single max-pool that is applied repeatedly.
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Reduce channels, apply the same max-pool three times in series, concatenate and fuse."""
        x = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # suppress torch 1.9.0 max_pool2d() warning
            y1 = self.m(x)
            y2 = self.m(y1)
            y3 = self.m(y2)
            # Concatenate the un-pooled map and the three pooled maps, then convolve.
            return self.cv2(torch.cat((x, y1, y2, y3), 1))


# Focus: moves spatial information into the channel dimension via slicing, then convolves
class Focus(nn.Module):
    """Focus width-height information into channel space using slicing and a convolution."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        """Initialize the Focus module.

        Args:
            c1: Input channels (the convolution receives ``4 * c1`` channels).
            c2: Output channels.
            k: Kernel size.
            s: Stride.
            p: Padding, forwarded to ``Conv``.
            g: Groups.
            act: Activation flag or module, forwarded to ``Conv``.
        """
        super().__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
        # self.contract = Contract(gain=2)

    def forward(self, x):
        """Reshape (b,c,w,h) to (b,4c,w/2,h/2) by strided slicing, then apply the convolution."""
        # Take every second element of the last two dims at each of the four phase offsets.
        patches = (
            x[..., ::2, ::2],
            x[..., 1::2, ::2],
            x[..., ::2, 1::2],
            x[..., 1::2, 1::2],
        )
        return self.conv(torch.cat(patches, 1))
        # return self.conv(self.contract(x))


# Ghost convolution
class GhostConv(nn.Module):
    """Ghost Convolution, see https://github.com/huawei-noah/ghostnet."""

    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
        """Initialize GhostConv; each of the two convolutions produces ``c2 // 2`` channels.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size of the first convolution.
            s: Stride of the first convolution.
            g: Groups of the first convolution.
            act: Activation flag or module, forwarded to both ``Conv`` layers.
        """
        super().__init__()
        c_ = c2 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
        # 5x5 convolution with groups == channels, applied to the output of cv1.
        self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)

    def forward(self, x):
        """Concatenate ``cv1(x)`` with ``cv2(cv1(x))`` along dim 1."""
        y = self.cv1(x)
        return torch.cat((y, self.cv2(y)), 1)


# GhostBottleneck
class GhostBottleneck(nn.Module):
    """Bottleneck layer built from Ghost Convolutions, see https://github.com/huawei-noah/ghostnet."""

    def __init__(self, c1, c2, k=3, s=1):
        """Initialize GhostBottleneck.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size of the depth-wise convolutions (only created when ``s == 2``).
            s: Stride; ``s == 2`` enables the depth-wise convolutions in both branches.
        """
        super().__init__()
        c_ = c2 // 2
        self.conv = nn.Sequential(
            GhostConv(c1, c_, 1, 1),  # pw
            DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(),  # dw
            GhostConv(c_, c2, 1, 1, act=False),  # pw-linear
        )
        # With stride 2 the shortcut must also downsample and change channels; otherwise it is identity.
        if s == 2:
            self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False))
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Return the sum of the conv branch and the shortcut branch applied to ``x``."""
        return self.conv(x) + self.shortcut(x)


# Contract: folds spatial dimensions into channels
class Contract(nn.Module):
    """Contract spatial dimensions into the channel dimension, e.g. (1,64,80,80) -> (1,256,40,40)."""

    def __init__(self, gain=2):
        """Store the contraction factor.

        Args:
            gain: Factor by which height and width shrink; channels grow by ``gain ** 2``.
        """
        super().__init__()
        self.gain = gain

    def forward(self, x):
        """Rearrange ``x`` from ``(b, c, h, w)`` to ``(b, c*s*s, h//s, w//s)`` with ``s = self.gain``.

        ``h`` and ``w`` must be divisible by ``s``; otherwise the ``view`` call fails.
        """
        b, c, h, w = x.size()
        s = self.gain
        x = x.view(b, c, h // s, s, w // s, s)  # x(1,64,40,2,40,2)
        # Move the two intra-cell offsets in front of the channel dim; copy to contiguous memory for view().
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return x.view(b, c * s * s, h // s, w // s)  # x(1,256,40,40)


# Expand: grows spatial dimensions by redistributing channels (reshape only)
class Expand(nn.Module):
    """Expand spatial dimensions by redistributing channels, e.g. (1,64,80,80) -> (1,16,160,160)."""

    def __init__(self, gain=2):
        """Store the expansion factor.

        Args:
            gain: Factor by which height and width grow; channels shrink by ``gain ** 2``.
        """
        super().__init__()
        self.gain = gain

    def forward(self, x):
        """Rearrange ``x`` from ``(b, c, h, w)`` to ``(b, c // s**2, h*s, w*s)`` with ``s = self.gain``.

        ``c`` must be divisible by ``s ** 2``; otherwise the ``view`` call fails.
        """
        b, c, h, w = x.size()
        s = self.gain
        x = x.view(b, s, s, c // s**2, h, w)  # x(1,2,2,16,80,80)
        # Interleave each offset dim with its spatial dim; copy to contiguous memory for view().
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return x.view(b, c // s**2, h * s, w * s)  # x(1,16,160,160)


# Concat: concatenates tensors along a given dimension
class Concat(nn.Module):
    """Concatenate a list of tensors along a fixed dimension."""

    def __init__(self, dimension=1):
        """Store the concatenation dimension.

        Args:
            dimension: Dimension passed to ``torch.cat``.
        """
        super().__init__()
        self.d = dimension

    def forward(self, x):
        """Return ``torch.cat(x, self.d)``; ``x`` is a list (or tuple) of tensors."""
        return torch.cat(x, self.d)


# DetectMultiBackend: inference wrapper over multiple model backends
class DetectMultiBackend(nn.Module):
    """YOLOv5 MultiBackend class for inference on various backends including PyTorch, ONNX, TensorRT, and more."""

    def __init__(self, weights="yolov5s.pt", device=torch.device("cpu"), dnn=False, data=None, fp16=False, fuse=True):
        """Load a model for the backend implied by the weights path and record its metadata.

        Every local variable of this method is copied onto ``self`` at the end
        (``self.__dict__.update(locals())``), so the local names below are part
        of the instance's attribute set and must not be renamed.

        Args:
            weights: Model path/URL, or a list whose first entry selects the backend.
            device: Torch device used for tensors (forced to ``cuda:0`` for TensorRT on CPU).
            dnn: Use OpenCV DNN for ONNX inference.
            data: Optional dataset YAML path providing class names as a fallback.
            fp16: Request half precision (only honoured for pt/jit/onnx/engine/triton).
            fuse: Forwarded to ``attempt_load`` for PyTorch weights.

        Raises:
            NotImplementedError: For TF.js weights or an unrecognised format.
        """
        #   PyTorch:              weights = *.pt
        #   TorchScript:                    *.torchscript
        #   ONNX Runtime:                   *.onnx
        #   ONNX OpenCV DNN:                *.onnx --dnn
        #   OpenVINO:                       *_openvino_model
        #   CoreML:                         *.mlpackage
        #   TensorRT:                       *.engine
        #   TensorFlow SavedModel:          *_saved_model
        #   TensorFlow GraphDef:            *.pb
        #   TensorFlow Lite:                *.tflite
        #   TensorFlow Edge TPU:            *_edgetpu.tflite
        #   PaddlePaddle:                   *_paddle_model
        from models.experimental import attempt_download, attempt_load  # scoped to avoid circular import

        super().__init__()
        # Weights path as a string (first entry when a list is given).
        w = str(weights[0] if isinstance(weights, list) else weights)
        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
        fp16 &= pt or jit or onnx or engine or triton  # FP16
        nhwc = coreml or saved_model or pb or tflite or edgetpu  # BHWC formats (vs torch BCWH)
        stride = 32  # default stride
        cuda = torch.cuda.is_available() and device.type != "cpu"  # use CUDA
        if not (pt or triton):
            w = attempt_download(w)  # download if not local

        if pt:  # PyTorch
            model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
            stride = max(int(model.stride.max()), 32)  # model stride
            names = model.module.names if hasattr(model, "module") else model.names  # get class names
            model.half() if fp16 else model.float()
            self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
        elif jit:  # TorchScript
            LOGGER.info(f"Loading {w} for TorchScript inference...")
            extra_files = {"config.txt": ""}  # model metadata
            model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
            model.half() if fp16 else model.float()
            if extra_files["config.txt"]:  # load metadata dict
                d = json.loads(
                    extra_files["config.txt"],
                    object_hook=lambda d: {int(k) if k.isdigit() else k: v for k, v in d.items()},
                )
                stride, names = int(d["stride"]), d["names"]
        elif dnn:  # ONNX OpenCV DNN
            LOGGER.info(f"Loading {w} for ONNX OpenCV DNN inference...")
            check_requirements("opencv-python>=4.5.4")
            net = cv2.dnn.readNetFromONNX(w)
        elif onnx:  # ONNX Runtime
            LOGGER.info(f"Loading {w} for ONNX Runtime inference...")
            check_requirements(("onnx", "onnxruntime-gpu" if cuda else "onnxruntime"))
            import onnxruntime

            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if cuda else ["CPUExecutionProvider"]
            session = onnxruntime.InferenceSession(w, providers=providers)
            output_names = [x.name for x in session.get_outputs()]
            meta = session.get_modelmeta().custom_metadata_map  # metadata
            if "stride" in meta:
                # literal_eval instead of eval: the metadata comes from the model file and
                # must not be able to execute code (same approach as the TFLite branch).
                stride, names = int(meta["stride"]), ast.literal_eval(meta["names"])
        elif xml:  # OpenVINO
            LOGGER.info(f"Loading {w} for OpenVINO inference...")
            check_requirements("openvino>=2023.0")  # requires openvino-dev: https://pypi.org/project/openvino-dev/
            from openvino.runtime import Core, Layout, get_batch

            core = Core()
            if not Path(w).is_file():  # if not *.xml
                w = next(Path(w).glob("*.xml"))  # get *.xml file from *_openvino_model dir
            ov_model = core.read_model(model=w, weights=Path(w).with_suffix(".bin"))
            if ov_model.get_parameters()[0].get_layout().empty:
                ov_model.get_parameters()[0].set_layout(Layout("NCHW"))
            batch_dim = get_batch(ov_model)
            if batch_dim.is_static:
                batch_size = batch_dim.get_length()
            ov_compiled_model = core.compile_model(ov_model, device_name="AUTO")  # AUTO selects best available device
            # _load_metadata returns (None, None) when the YAML is missing; in that case keep the
            # default stride and let the class-name fallback below run instead of storing None.
            meta_stride, meta_names = self._load_metadata(Path(w).with_suffix(".yaml"))  # load metadata
            if meta_stride is not None:
                stride = meta_stride
            if meta_names is not None:
                names = meta_names
        elif engine:  # TensorRT
            LOGGER.info(f"Loading {w} for TensorRT inference...")
            import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download

            check_version(trt.__version__, "7.0.0", hard=True)  # require tensorrt>=7.0.0
            if device.type == "cpu":
                device = torch.device("cuda:0")
            Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
            logger = trt.Logger(trt.Logger.INFO)
            with open(w, "rb") as f, trt.Runtime(logger) as runtime:
                model = runtime.deserialize_cuda_engine(f.read())
            context = model.create_execution_context()
            bindings = OrderedDict()
            output_names = []
            fp16 = False  # default updated below
            dynamic = False
            # Engines without `num_bindings` are handled through the named-tensor API.
            is_trt10 = not hasattr(model, "num_bindings")
            num = range(model.num_io_tensors) if is_trt10 else range(model.num_bindings)
            for i in num:
                if is_trt10:
                    name = model.get_tensor_name(i)
                    dtype = trt.nptype(model.get_tensor_dtype(name))
                    is_input = model.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                    if is_input:
                        if -1 in tuple(model.get_tensor_shape(name)):  # dynamic
                            dynamic = True
                            context.set_input_shape(name, tuple(model.get_profile_shape(name, 0)[2]))
                        if dtype == np.float16:
                            fp16 = True
                    else:  # output
                        output_names.append(name)
                    shape = tuple(context.get_tensor_shape(name))
                else:
                    name = model.get_binding_name(i)
                    dtype = trt.nptype(model.get_binding_dtype(i))
                    if model.binding_is_input(i):
                        if -1 in tuple(model.get_binding_shape(i)):  # dynamic
                            dynamic = True
                            context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
                        if dtype == np.float16:
                            fp16 = True
                    else:  # output
                        output_names.append(name)
                    shape = tuple(context.get_binding_shape(i))
                im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
            binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
            batch_size = bindings["images"].shape[0]  # if dynamic, this is instead max batch size
        elif coreml:  # CoreML
            LOGGER.info(f"Loading {w} for CoreML inference...")
            import coremltools as ct

            model = ct.models.MLModel(w)
        elif saved_model:  # TF SavedModel
            LOGGER.info(f"Loading {w} for TensorFlow SavedModel inference...")
            import tensorflow as tf

            keras = False  # assume TF1 saved_model
            model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
        elif pb:  # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
            LOGGER.info(f"Loading {w} for TensorFlow GraphDef inference...")
            import tensorflow as tf

            def wrap_frozen_graph(gd, inputs, outputs):
                """Wraps a TensorFlow GraphDef for inference, returning a pruned function."""
                x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), [])  # wrapped
                ge = x.graph.as_graph_element
                return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))

            def gd_outputs(gd):
                """Generates a sorted list of graph outputs excluding NoOp nodes and inputs, formatted as '<name>:0'."""
                name_list, input_list = [], []
                for node in gd.node:  # tensorflow.core.framework.node_def_pb2.NodeDef
                    name_list.append(node.name)
                    input_list.extend(node.input)
                return sorted(f"{x}:0" for x in list(set(name_list) - set(input_list)) if not x.startswith("NoOp"))

            gd = tf.Graph().as_graph_def()  # TF GraphDef
            with open(w, "rb") as f:
                gd.ParseFromString(f.read())
            frozen_func = wrap_frozen_graph(gd, inputs="x:0", outputs=gd_outputs(gd))
        elif tflite or edgetpu:  # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
            try:  # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
                from tflite_runtime.interpreter import Interpreter, load_delegate
            except ImportError:
                import tensorflow as tf

                Interpreter, load_delegate = (
                    tf.lite.Interpreter,
                    tf.lite.experimental.load_delegate,
                )
            if edgetpu:  # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
                LOGGER.info(f"Loading {w} for TensorFlow Lite Edge TPU inference...")
                delegate = {"Linux": "libedgetpu.so.1", "Darwin": "libedgetpu.1.dylib", "Windows": "edgetpu.dll"}[
                    platform.system()
                ]
                interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
            else:  # TFLite
                LOGGER.info(f"Loading {w} for TensorFlow Lite inference...")
                interpreter = Interpreter(model_path=w)  # load TFLite model
            interpreter.allocate_tensors()  # allocate
            input_details = interpreter.get_input_details()  # inputs
            output_details = interpreter.get_output_details()  # outputs
            # load metadata
            with contextlib.suppress(zipfile.BadZipFile):
                with zipfile.ZipFile(w, "r") as model:
                    meta_file = model.namelist()[0]
                    meta = ast.literal_eval(model.read(meta_file).decode("utf-8"))
                    stride, names = int(meta["stride"]), meta["names"]
        elif tfjs:  # TF.js
            raise NotImplementedError("ERROR: YOLOv5 TF.js inference is not supported")
        elif paddle:  # PaddlePaddle
            LOGGER.info(f"Loading {w} for PaddlePaddle inference...")
            check_requirements("paddlepaddle-gpu" if cuda else "paddlepaddle")
            import paddle.inference as pdi

            if not Path(w).is_file():  # if not *.pdmodel
                w = next(Path(w).rglob("*.pdmodel"))  # get *.pdmodel file from *_paddle_model dir
            weights = Path(w).with_suffix(".pdiparams")
            config = pdi.Config(str(w), str(weights))
            if cuda:
                config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
            predictor = pdi.create_predictor(config)
            input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
            output_names = predictor.get_output_names()
        elif triton:  # NVIDIA Triton Inference Server
            LOGGER.info(f"Using {w} as Triton Inference Server...")
            check_requirements("tritonclient[all]")
            from utils.triton import TritonRemoteModel

            model = TritonRemoteModel(url=w)
            nhwc = model.runtime.startswith("tensorflow")
        else:
            raise NotImplementedError(f"ERROR: {w} is not a supported format")

        # class names
        if "names" not in locals():
            names = yaml_load(data)["names"] if data else {i: f"class{i}" for i in range(999)}
        if names[0] == "n01440764" and len(names) == 1000:  # ImageNet
            names = yaml_load(ROOT / "data/ImageNet.yaml")["names"]  # human-readable names

        self.__dict__.update(locals())  # assign all variables to self

    def forward(self, im, augment=False, visualize=False):
        """Run inference on ``im`` with the loaded backend and return torch tensor(s).

        Args:
            im: Input batch tensor; its shape is unpacked as (batch, channel, height, width).
            augment: Forwarded to the PyTorch model when set.
            visualize: Forwarded to the PyTorch model when set.

        Returns:
            A single tensor, or a list of tensors when the backend yields several outputs.
        """
        b, ch, h, w = im.shape  # batch, channel, height, width
        if self.fp16 and im.dtype != torch.float16:
            im = im.half()  # to FP16
        if self.nhwc:
            im = im.permute(0, 2, 3, 1)  # torch BCHW to numpy BHWC shape(1,320,192,3)

        if self.pt:  # PyTorch
            y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
        elif self.jit:  # TorchScript
            y = self.model(im)
        elif self.dnn:  # ONNX OpenCV DNN
            im = im.cpu().numpy()  # torch to numpy
            self.net.setInput(im)
            y = self.net.forward()
        elif self.onnx:  # ONNX Runtime
            im = im.cpu().numpy()  # torch to numpy
            y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im})
        elif self.xml:  # OpenVINO
            im = im.cpu().numpy()  # FP32
            y = list(self.ov_compiled_model(im).values())
        elif self.engine:  # TensorRT
            if self.dynamic and im.shape != self.bindings["images"].shape:
                # reshape if dynamic; use the same API family that __init__ selected via is_trt10
                if self.is_trt10:
                    self.context.set_input_shape("images", im.shape)
                    self.bindings["images"] = self.bindings["images"]._replace(shape=im.shape)
                    for name in self.output_names:
                        self.bindings[name].data.resize_(tuple(self.context.get_tensor_shape(name)))
                else:
                    i = self.model.get_binding_index("images")
                    self.context.set_binding_shape(i, im.shape)
                    self.bindings["images"] = self.bindings["images"]._replace(shape=im.shape)
                    for name in self.output_names:
                        i = self.model.get_binding_index(name)
                        self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
            s = self.bindings["images"].shape
            assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
            self.binding_addrs["images"] = int(im.data_ptr())
            self.context.execute_v2(list(self.binding_addrs.values()))
            y = [self.bindings[x].data for x in sorted(self.output_names)]
        elif self.coreml:  # CoreML
            im = im.cpu().numpy()
            im = Image.fromarray((im[0] * 255).astype("uint8"))
            # im = im.resize((192, 320), Image.BILINEAR)
            y = self.model.predict({"image": im})  # coordinates are xywh normalized
            if "confidence" in y:
                box = xywh2xyxy(y["coordinates"] * [[w, h, w, h]])  # xyxy pixels
                # builtin float: the `np.float` alias was removed from NumPy
                conf, cls = y["confidence"].max(1), y["confidence"].argmax(1).astype(float)
                y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
            else:
                y = list(reversed(y.values()))  # reversed for segmentation models (pred, proto)
        elif self.paddle:  # PaddlePaddle
            im = im.cpu().numpy().astype(np.float32)
            self.input_handle.copy_from_cpu(im)
            self.predictor.run()
            y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
        elif self.triton:  # NVIDIA Triton Inference Server
            y = self.model(im)
        else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
            im = im.cpu().numpy()
            if self.saved_model:  # SavedModel
                y = self.model(im, training=False) if self.keras else self.model(im)
            elif self.pb:  # GraphDef
                y = self.frozen_func(x=self.tf.constant(im))
            else:  # Lite or Edge TPU
                input_detail = self.input_details[0]
                int8 = input_detail["dtype"] == np.uint8  # is TFLite quantized uint8 model
                if int8:
                    scale, zero_point = input_detail["quantization"]
                    im = (im / scale + zero_point).astype(np.uint8)  # de-scale
                self.interpreter.set_tensor(input_detail["index"], im)
                self.interpreter.invoke()
                y = []
                for output in self.output_details:
                    x = self.interpreter.get_tensor(output["index"])
                    if int8:
                        scale, zero_point = output["quantization"]
                        x = (x.astype(np.float32) - zero_point) * scale  # re-scale
                    y.append(x)
            y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
            y[0][..., :4] *= [w, h, w, h]  # xywh normalized to pixels

        if isinstance(y, (list, tuple)):
            return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
        else:
            return self.from_numpy(y)

    def from_numpy(self, x):
        """Convert a NumPy array to a torch tensor on ``self.device``; return any other value unchanged."""
        return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x

    def warmup(self, imgsz=(1, 3, 640, 640)):
        """Run one warmup inference (two for TorchScript) on an empty tensor of shape ``imgsz``.

        Only runs for pt/jit/onnx/engine/saved_model/pb/triton backends, and only
        when the device is not CPU or the backend is Triton.
        """
        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton
        if any(warmup_types) and (self.device.type != "cpu" or self.triton):
            im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
            for _ in range(2 if self.jit else 1):
                self.forward(im)  # warmup

    @staticmethod
    def _model_type(p="path/to/model.pt"):
        """Determine the model type from a file path or URL.

        Example: path='path/to/model.onnx' -> type=onnx

        Returns:
            A list of booleans, one per export-format suffix, followed by the Triton flag.
        """
        # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle]
        from export import export_formats
        from utils.downloads import is_url

        sf = list(export_formats().Suffix)  # export suffixes
        if not is_url(p, check=False):
            check_suffix(p, sf)  # checks
        url = urlparse(p)  # if url may be Triton inference server
        types = [s in Path(p).name for s in sf]
        types[8] &= not types[9]  # tflite &= not edgetpu
        triton = not any(types) and all([any(s in url.scheme for s in ["http", "grpc"]), url.netloc])
        return types + [triton]

    @staticmethod
    def _load_metadata(f=Path("path/to/meta.yaml")):
        """Load ``(stride, names)`` from a YAML file, or return ``(None, None)`` if it does not exist."""
        if f.exists():
            d = yaml_load(f)
            return d["stride"], d["names"]  # assign stride, names
        return None, None


# AutoShape: input pre-processing wrapper
class AutoShape(nn.Module):
    """Input-robust inference wrapper around a YOLOv5 model.

    Accepts file paths, URLs, PIL images, numpy arrays, torch tensors or lists of these, runs
    pre-processing, inference and NMS, and returns a `Detections` object (or the raw model output when the
    input is already a torch tensor).
    """

    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    agnostic = False  # NMS class-agnostic
    multi_label = False  # NMS multiple labels per box
    classes = None  # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
    max_det = 1000  # maximum number of detections per image
    amp = False  # Automatic Mixed Precision (AMP) inference

    def __init__(self, model, verbose=True):
        """Wraps `model` for inference: copies selected attributes, switches it to eval mode and configures Detect().

        Args:
            model: The model to wrap; may be a `DetectMultiBackend` instance or a plain PyTorch model.
            verbose: When True, logs a message announcing that AutoShape is being added.
        """
        super().__init__()
        if verbose:
            LOGGER.info("Adding AutoShape... ")
        copy_attr(self, model, include=("yaml", "nc", "hyp", "names", "stride", "abc"), exclude=())  # copy attributes
        self.dmb = isinstance(model, DetectMultiBackend)  # DetectMultiBackend() instance
        self.pt = not self.dmb or model.pt  # PyTorch model
        self.model = model.eval()
        if self.pt:
            m = self.model.model.model[-1] if self.dmb else self.model.model[-1]  # Detect()
            m.inplace = False  # Detect.inplace=False for safe multithread inference
            m.export = True  # do not output loss values

    def _apply(self, fn):
        """Applies `fn` (to(), cpu(), cuda(), half(), ...) to the module and to Detect() tensors.

        After the standard `nn.Module._apply`, `fn` is also applied to the last layer's `stride`, `grid` and
        (when it is a list) `anchor_grid`, which are handled here explicitly.
        """
        self = super()._apply(fn)
        if self.pt:
            m = self.model.model.model[-1] if self.dmb else self.model.model[-1]  # Detect()
            m.stride = fn(m.stride)
            m.grid = list(map(fn, m.grid))
            if isinstance(m.anchor_grid, list):
                m.anchor_grid = list(map(fn, m.anchor_grid))
        return self

    @smart_inference_mode()
    def forward(self, ims, size=640, augment=False, profile=False):
        """Runs inference on `ims`, with pre-processing and NMS for non-tensor inputs.

        Args:
            ims: A file path, URL, PIL image, numpy array, torch tensor, or a list/tuple of these.
            size: Target inference size; an int is expanded to `(size, size)`.
            augment: Forwarded to the wrapped model as `augment=`.
            profile: Accepted for interface compatibility; not referenced in this method body.

        Returns:
            The raw model output when `ims` is a torch tensor, otherwise a `Detections` instance.
        """
        # For size(height=640, width=1280), RGB images example inputs are:
        #   file:        ims = 'data/images/zidane.jpg'  # str or PosixPath
        #   URI:             = 'https://ultralytics.com/images/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
        #   PIL:             = Image.open('image.jpg') or ImageGrab.grab()  # HWC x(640,1280,3)
        #   numpy:           = np.zeros((640,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        dt = (Profile(), Profile(), Profile())  # pre-process, inference, NMS profilers
        with dt[0]:
            if isinstance(size, int):  # expand
                size = (size, size)
            p = next(self.model.parameters()) if self.pt else torch.empty(1, device=self.model.device)  # param
            autocast = self.amp and (p.device.type != "cpu")  # Automatic Mixed Precision (AMP) inference
            if isinstance(ims, torch.Tensor):  # torch
                # Tensor input bypasses pre-processing and NMS entirely
                with amp.autocast(autocast):
                    return self.model(ims.to(p.device).type_as(p), augment=augment)  # inference

            # Pre-process: normalize the input to a list of images and record their count
            n, ims = (len(ims), list(ims)) if isinstance(ims, (list, tuple)) else (1, [ims])  # number, list of images
            shape0, shape1, files = [], [], []  # image and inference shapes, filenames
            for i, im in enumerate(ims):
                f = f"image{i}"  # filename
                if isinstance(im, (str, Path)):  # filename or uri
                    im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im), im
                    im = np.asarray(exif_transpose(im))
                elif isinstance(im, Image.Image):  # PIL Image
                    im, f = np.asarray(exif_transpose(im)), getattr(im, "filename", f) or f
                files.append(Path(f).with_suffix(".jpg").name)
                if im.shape[0] < 5:  # image in CHW
                    im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
                # Keep the first three channels of a 3-D array; otherwise expand grayscale to three channels
                im = im[..., :3] if im.ndim == 3 else cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # enforce 3ch input
                s = im.shape[:2]  # HWC
                shape0.append(s)  # image shape
                g = max(size) / max(s)  # gain: largest target side over largest image side
                shape1.append([int(y * g) for y in s])  # scaled shape for this image
                ims[i] = im if im.data.contiguous else np.ascontiguousarray(im)  # update
            shape1 = [make_divisible(x, self.stride) for x in np.array(shape1).max(0)]  # inf shape
            x = [letterbox(im, shape1, auto=False)[0] for im in ims]  # pad
            x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2)))  # stack and BHWC to BCHW
            x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32

        with amp.autocast(autocast):
            # Inference
            with dt[1]:
                y = self.model(x, augment=augment)  # forward

            # Post-process
            with dt[2]:
                y = non_max_suppression(
                    y if self.dmb else y[0],
                    self.conf,
                    self.iou,
                    self.classes,
                    self.agnostic,
                    self.multi_label,
                    max_det=self.max_det,
                )  # NMS
                for i in range(n):
                    scale_boxes(shape1, y[i][:, :4], shape0[i])

            return Detections(ims, y, files, dt, self.names, x.shape)


class Detections:
    """Holds YOLOv5 detection results with helpers for display, saving, cropping, rendering and export."""

    def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None):
        """Stores images, predictions and metadata, and derives xywh and normalized box variants.

        Args:
            ims: List of images as numpy arrays.
            pred: List of prediction tensors, one per image; pred[0] = (xyxy, conf, cls).
            files: List of image filenames.
            times: Profiling objects; each element's `.t` attribute is read to compute per-image timings.
            names: Class names, indexed by integer class id.
            shape: Inference BCHW shape; stored as a tuple.
        """
        super().__init__()
        d = pred[0].device  # device
        # Per-image divisor built from im.shape[1], im.shape[0], im.shape[1], im.shape[0], 1, 1
        gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims]  # normalizations
        self.ims = ims  # list of images as numpy arrays
        self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
        self.names = names  # class names
        self.files = files  # image filenames
        self.times = times  # profiling times
        self.xyxy = pred  # xyxy pixels
        self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
        self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
        self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
        self.n = len(self.pred)  # number of images (batch size)
        self.t = tuple(x.t / self.n * 1e3 for x in times)  # timestamps (ms)
        self.s = tuple(shape)  # inference BCHW shape

    def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path("")):
        """Walks all images and, depending on the flags, summarizes, annotates, shows, saves, crops or renders them.

        Returns:
            The summary string when `pprint` is True; otherwise the list of crop dicts when `crop` is True;
            otherwise None.
        """
        s, crops = "", []
        for i, (im, pred) in enumerate(zip(self.ims, self.pred)):
            s += f"\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} "  # string
            if pred.shape[0]:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
                s = s.rstrip(", ")
                if show or save or render or crop:
                    annotator = Annotator(im, example=str(self.names))
                    for *box, conf, cls in reversed(pred):  # xyxy, confidence, class
                        label = f"{self.names[int(cls)]} {conf:.2f}"
                        if crop:
                            file = save_dir / "crops" / self.names[int(cls)] / self.files[i] if save else None
                            crops.append(
                                {
                                    "box": box,
                                    "conf": conf,
                                    "cls": cls,
                                    "label": label,
                                    "im": save_one_box(box, im, file=file, save=save),
                                }
                            )
                        else:  # all others
                            # Draw the box, with its label unless labels are disabled
                            annotator.box_label(box, label if labels else "", color=colors(cls))
                    im = annotator.im
            else:
                s += "(no detections)"

            im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im  # from np
            if show:
                if is_jupyter():
                    from IPython.display import display

                    display(im)
                else:
                    im.show(self.files[i])
            if save:
                f = self.files[i]
                im.save(save_dir / f)  # save
                if i == self.n - 1:
                    LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to {colorstr('bold', save_dir)}")
            if render:
                self.ims[i] = np.asarray(im)
        if pprint:
            s = s.lstrip("\n")
            # %-format only the constant timing template: applying `%` to the whole string would treat any
            # '%' inside a class name already interpolated into `s` as a format directive and raise.
            speed = "Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image" % self.t
            return f"{s}\n{speed} at shape {self.s}"
        if crop:
            if save:
                LOGGER.info(f"Saved results to {save_dir}\n")
            return crops

    @TryExcept("Showing images is not supported in this environment")
    def show(self, labels=True):
        """Displays detection results with optional labels.

        Usage: show(labels=True)
        """
        self._run(show=True, labels=labels)  # show results

    def save(self, labels=True, save_dir="runs/detect/exp", exist_ok=False):
        """Saves detection results with optional labels to a specified directory.

        Usage: save(labels=True, save_dir='runs/detect/exp', exist_ok=False)
        """
        save_dir = increment_path(save_dir, exist_ok, mkdir=True)  # increment save_dir
        self._run(save=True, labels=labels, save_dir=save_dir)  # save results

    def crop(self, save=True, save_dir="runs/detect/exp", exist_ok=False):
        """Crops detection results and optionally saves them to a directory; returns the list of crop dicts.

        Args: save (bool), save_dir (str), exist_ok (bool).
        """
        save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
        return self._run(crop=True, save=save, save_dir=save_dir)  # crop results

    def render(self, labels=True):
        """Renders detection results onto the stored images (optionally with labels) and returns `self.ims`."""
        self._run(render=True, labels=labels)  # render results
        return self.ims

    def pandas(self):
        """Returns a copy whose xyxy, xyxyn, xywh and xywhn attributes are lists of pandas DataFrames.

        Example: print(results.pandas().xyxy[0]).
        """
        new = copy(self)  # return copy
        ca = "xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"  # xyxy columns
        cb = "xcenter", "ycenter", "width", "height", "confidence", "class", "name"  # xywh columns
        for k, c in zip(["xyxy", "xyxyn", "xywh", "xywhn"], [ca, ca, cb, cb]):
            # Each row keeps its first five values, then the class id as int and the matching class name
            a = [
                [row[:5] + [int(row[5]), self.names[int(row[5])]] for row in det.tolist()] for det in getattr(self, k)
            ]  # update
            setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
        return new

    def tolist(self):
        """Splits this object into a list of single-image Detections objects for iteration.

        Example: for result in results.tolist():
        """
        r = range(self.n)  # iterable
        return [
            Detections(
                [self.ims[i]],
                [self.pred[i]],
                [self.files[i]],
                self.times,
                self.names,
                self.s,
            )
            for i in r
        ]

    def print(self):
        """Logs the string representation of the results via LOGGER."""
        LOGGER.info(self.__str__())

    def __len__(self):
        """Returns the number of results stored, overrides the default len(results)."""
        return self.n

    def __str__(self):
        """Returns the printable results summary, overrides the default print(results)."""
        return self._run(pprint=True)  # print results

    def __repr__(self):
        """Returns a string with the object's class followed by the formatted results summary."""
        return f"YOLOv5 {self.__class__} instance\n" + self.__str__()


class Proto(nn.Module):
    """YOLOv5 mask Proto module for segmentation models: conv, 2x nearest upsample, conv, conv."""

    def __init__(self, c1, c_=256, c2=32):
        """Builds the Proto layers.

        Args:
            c1: Input channels.
            c_: Proto (intermediate) channels.
            c2: Mask (output) channels.
        """
        super().__init__()
        self.cv1 = Conv(c1, c_, k=3)
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
        self.cv2 = Conv(c_, c_, k=3)
        self.cv3 = Conv(c_, c2)

    def forward(self, x):
        """Applies cv1, upsample, cv2 and cv3 in sequence to input tensor `x`."""
        return self.cv3(self.cv2(self.upsample(self.cv1(x))))


# Classification module
class Classify(nn.Module):
    """YOLOv5 classification head: convolution, global average pooling, dropout, then a linear layer."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, dropout_p=0.0):
        """Builds the classification head.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Convolution kernel size.
            s: Convolution stride.
            p: Convolution padding; passed through `autopad` together with `k`.
            g: Convolution groups.
            dropout_p: Dropout probability.
        """
        super().__init__()
        hidden = 1280  # efficientnet_b0 size
        padding = autopad(k, p)
        self.conv = Conv(c1, hidden, k, s, padding, g)
        self.pool = nn.AdaptiveAvgPool2d(1)  # to x(b,hidden,1,1)
        self.drop = nn.Dropout(p=dropout_p, inplace=True)
        self.linear = nn.Linear(hidden, c2)  # to x(b,c2)

    def forward(self, x):
        """Runs conv, pool, flatten, dropout and linear on `x`; a list input is first concatenated on dim 1."""
        feats = torch.cat(x, 1) if isinstance(x, list) else x
        pooled = self.pool(self.conv(feats))
        flat = pooled.flatten(1)
        return self.linear(self.drop(flat))
总的来说没那么复杂