YOLOv5代码解读[01] train.py

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
import argparse
import math
import os
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import numpy as np
import yaml
# NOTE: these two imports were fused onto one line ("tqdmimport"), which is a SyntaxError.
from tqdm import tqdm

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD, Adam, AdamW, lr_scheduler# 解析成绝对路径
# Absolute, symlink-resolved path of this file.
FILE = Path(__file__).resolve()
# YOLOv5 root directory: the folder that contains this file.
ROOT = FILE.parents[0]
# Add ROOT to sys.path (only once) before the project-local imports that follow.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
# Re-express ROOT relative to the current working directory.
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))

# Project-local module. It was fused onto the previous statement (a SyntaxError),
# so it is restored here as its own statement.
import val
from models.experimental import attempt_load
from models.yolo import Model
# from models2.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, check_dataset, check_file, check_git_status, check_img_size, check_requirements,check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds,intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle,print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss, ComputeLossOTA
from utils.metrics import fitness
from utils.plots import plot_lr_scheduler, plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first# DDP模式
# pytorch中的有两种分布式训练方式，一种是常用的DataParallel(DP)，另外一种是DistributedDataParallel(DDP)，两者都可以用来实现数据并行方式的分布式训练。
# DP是单进程多线程的实现方式，DDP是采用多进程的方式，DDP相比于DP训练速度要快。
# (1) 使用 torch.distributed.init_process_group 初始化进程组
# (2) 使用 torch.nn.parallel.DistributedDataParallel 创建分布式模型
# (3) 使用 torch.utils.data.distributed.DistributedSampler 创建 DataLoader
# (4) 调整其他必要的地方(tensor放到指定device上， S/L checkpoint，指标计算等)
# (5) 使用 torch.distributed.launch / torch.multiprocessing 或 slurm 开始训练
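# 设置DDP模式的参数：world_size 表示全局进程总数（通常等于参与训练的GPU总数），global_rank 表示当前进程的全局编号。
# 只有通过 torchrun / torch.distributed.launch 启动DDP训练时，启动器才会设置'WORLD_SIZE'和'RANK'等环境变量（单机多卡的DDP同样会设置，并不限于多机多卡）。
# 未设置这些环境变量时取默认值1和-1，对应非DDP的情形：单机多卡(DP)、单机单卡、CPU。
# 设置DDP模式变量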
# Distributed-run settings read from the process environment. When a variable
# is not set, the fallback applies: WORLD_SIZE -> 1, LOCAL_RANK -> -1, RANK -> -1.
# Reference: https://pytorch.org/docs/stable/elastic/run.html
WORLD_SIZE, LOCAL_RANK, RANK = (
    int(os.environ.get(env_name, fallback))
    for env_name, fallback in (('WORLD_SIZE', 1), ('LOCAL_RANK', -1), ('RANK', -1))
)
# world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
# global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
# rank = global_rankdef train(hyp, opt, device, callbacks):version = opt.versionsave_dir = Path(opt.save_dir)epochs = opt.epochsbatch_size = opt.batch_sizeweights = opt.weightssingle_cls = opt.single_clsevolve = opt.evolvedata = opt.datacfg = opt.cfgresume = opt.resumenoval = opt.noval nosave = opt.nosaveworkers = opt.workersfreeze = opt.freeze#-------------------------------------------------------------------------------------------#"                                     训练权重保存路径                                       "#-------------------------------------------------------------------------------------------## 训练权重保存路径w = save_dir / 'weights'  (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir# 最后的模型last, 最好的模型bestlast = w / 'last.pt'best = w / 'best.pt'#-------------------------------------------------------------------------------------------#"                                   超参数Hyperparameters加载                                "#-------------------------------------------------------------------------------------------## 超参数Hyperparametersif isinstance(hyp, str):with open(hyp, errors='ignore') as f:# 加载hyps字典hyp = yaml.safe_load(f)  LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))# 保存opt.yaml和hyp.yaml配置if not evolve:with open(save_dir / 'hyp.yaml', 'w') as f:yaml.safe_dump(hyp, f, sort_keys=False)with open(save_dir / 'opt.yaml', 'w') as f:yaml.safe_dump(vars(opt), f, sort_keys=False)# 日志打印Loggersdata_dict = Noneif RANK in [-1, 0]:loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  if loggers.wandb:data_dict = loggers.wandb.data_dictif resume:weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size# Register actionsfor k in methods(loggers):callbacks.register_action(k, callback=getattr(loggers, k))# 是否绘制曲线plots = not evolve  cuda = device.type != 'cpu'init_seeds(1 + RANK)#-------------------------------------------------------------------------------------------#"  
                                    dataset数据集加载                                     "#-------------------------------------------------------------------------------------------## data: /home/easyits/road-risk-identification/yolo/YOLOV5/data/coco.yamlwith torch_distributed_zero_first(LOCAL_RANK):data_dict = data_dict or check_dataset(data)  # data_dict: # {'path': './datasets/roadrisk', #  'train': '/home/easyits/road-risk-identification/yolo/YOLOV5/datasets/roadrisk/train2017.txt', #  'val': '/home/easyits/road-risk-identification/yolo/YOLOV5/datasets/roadrisk/val2017.txt', #  'test': '/home/easyits/road-risk-identification/yolo/YOLOV5/datasets/roadrisk/val2017.txt', #  'nc': 6, 'names': ['build', 'person', 'sink', 'garbage', 'pit', 'trouble']} train_path = data_dict['train'] val_path = data_dict['val']# 类别数目class numbersnc = 1 if single_cls else int(data_dict['nc']) # 类别名字class namesnames = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # 判断类别设置是否一致assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # COCO datasetis_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt') #-------------------------------------------------------------------------------------------#"                                        Model模型加载                                      "#-------------------------------------------------------------------------------------------## 加载预训练权重if version == 1:# 检查weights的后缀check_suffix(weights, '.pt')  pretrained = weights.endswith('.pt')if pretrained:# 如果在本地没有找到的话，就尝试下载# torch_distributed_zero_first(LOCAL_RANK): 用于同步不同进程对数据读取的上下文管理器with torch_distributed_zero_first(LOCAL_RANK):# google云盘下载weights = attempt_download(weights)  # 加载保存的checkpoint# load checkpoint to CPU to avoid CUDA memory leak# torch.load()会同时保存和加载模型的参数和结构信息ckpt = torch.load(weights, map_location='cpu')  # 定义模型结构model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # 
排除exclude keys# 从零开始训练，由于anchor需要重新聚类，anchor要排除；断点续练，直接接在anchor;exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] csd = ckpt['model'].float().state_dict()  # 交集intersectcsd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # 模型加载状态字典# 设置strict参数为False，可以忽略那些没有匹配到的keys。model.load_state_dict(csd, strict=False)  LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # 构建模型else:model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) elif version == 2:# 检查weights的后缀check_suffix(weights, ['.pt', '.pth'])  pretrained = weights.endswith('.pt') or weights.endswith('.pth')if pretrained:ckpt = torch.load(weights, map_location=device)model = Model(cfg).to(device)if resume:# 排除exclude keys# 从零开始训练，由于anchor需要重新聚类，anchor要排除；断点续练，直接接在anchor;exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] csd = ckpt['model'].float().state_dict()  # 交集intersectcsd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # 模型加载状态字典# 设置strict参数为False，可以忽略那些没有匹配到的keys。model.load_state_dict(csd, strict=False)  else:# 删除有关分类类别的权重for k in list(ckpt.keys()):if "head" in k:del ckpt[k]if 'model' in ckpt.keys():csd = ckpt['model']  else:csd = ckpt # 模型加载状态字典# 设置strict参数为False，可以忽略那些没有匹配到的keys。model.load_state_dict(csd, strict=False)miss_key, unexpected_key = model.backbone.load_state_dict(csd, strict=False)print("预训练权重加载结果: \n") # LOGGER.info(f'miss_key:{miss_key}')# LOGGER.info(f'unexpected_key:{unexpected_key}')else:model = Model(cfg).to(device)  else:pass# Freeze是否冻住某些网络层或全部层# freeze: [0]freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # freeze: []# model.state_dict(): 字典的遍历默认是遍历key, 例如:conv1.weight, conv1.bias。# model.parameters()# model.named_parameters(): 字典的遍历是一个元组tuple ,元组的第一个元素是参数所对应的名称，第二个元素就是对应的参数值。for k, v in model.named_parameters():# 训练所有的层train all layersv.requires_grad = True # 冻住某些层# k: model.2.cv2.bn.weight, model.2.m.0.cv1.conv.weight
YOLOv5代码解读[01] train.py

相关文章

文件包含+文件上传漏洞（图片马绕过）

网络安全-pikachu之文件上传漏洞1

☀️将大华摄像头画面接入Unity 【1】配置硬件和初始化摄像头

【知识点】CNN中concat与add的区别

工业自动化部署选择主板的关键因素

搜维尔科技：分析OptiTrack光学动作捕捉应用领域！

trojan 突然无法上网

淘宝、1688以图搜图api使用示例

组态软件行业分析：预计2025年市场空间可达数千亿元

[office] Excel中DCOUNT函数在复杂的数据中统计应用图解教程 #职场发展#其他#媒体

佳能2580的下载手册

Eureka注册中心：实现微服务架构下的服务发现与治理的艺术（一）

【ansible】认识ansible，了解常用的模块

【附代码】Python Excel合并单元格（OpenPyXL） Pandas.DataFrame groupby样式保存xlsx

解释 C++ 中的静态成员变量和静态成员函数。

Bert基础(二)--多头注意力

在本地计算机上运行Python程序

linux监控系统资源命令

Python六级考试笔记

3、安装插件