一、训练部分相关代码详解
tools/train.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
# NOTE(review): reconstructed from a collapsed paste (newlines were stripped, so
# trailing "#" comments swallowed the code after them). Chinese comments
# translated to English.

import argparse
import random
import warnings

from loguru import logger

import torch
import torch.backends.cudnn as cudnn

from yolox.core import Trainer, launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl, configure_omp, get_num_devices


def make_parser():
    """Build the command-line argument parser for YOLOX training."""
    parser = argparse.ArgumentParser("YOLOX train parser")
    # Experiment name; outputs land under YOLOX_outputs/<experiment-name>.
    parser.add_argument("-expn", "--experiment-name", type=str, default='Dark')
    # Model flavour (s / m / l / x ...).
    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
    parser.add_argument("-b", "--batch-size", type=int, default=8, help="batch size")
    # BUG FIX: the paste had default=0, which made the `num_gpu` computation in
    # __main__ always evaluate to 0 and launch zero training processes.
    # default=None lets get_num_devices() detect the GPU count instead.
    parser.add_argument(
        "-d", "--devices", default=None, type=int, help="device for training"
    )
    parser.add_argument(
        "-f",
        "--exp_file",
        default='../exps/yolox_voc_s.py',  # experiment description file (dataset config etc.)
        type=str,
        help="plz input your experiment description file",
    )
    # --------- options for resuming an interrupted run ---------
    parser.add_argument("--resume", default=False, action="store_true", help="resume training")
    parser.add_argument("-c", "--ckpt", default=None, type=str, help="checkpoint file")
    parser.add_argument(
        "-e",
        "--start_epoch",
        default=None,
        type=int,
        help="resume training start epoch",
    )
    # --------- multi-machine options ---------
    parser.add_argument(
        "--num_machines", default=1, type=int, help="num of node for training"
    )
    parser.add_argument(
        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
    )
    parser.add_argument(
        "--fp16",
        dest="fp16",
        default=False,
        action="store_true",
        help="Adopting mix precision training.",
    )
    parser.add_argument(
        "--cache",
        dest="cache",
        default=False,
        action="store_true",
        help="Caching imgs to RAM for fast training.",
    )
    # NOTE(review): default=True combined with store_true means this flag can
    # never be switched off from the CLI; kept as pasted (author's choice).
    parser.add_argument(
        "-o",
        "--occupy",
        dest="occupy",
        default=True,
        action="store_true",
        help="occupy GPU memory first for training.",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    return parser


@logger.catch
def main(exp, args):
    """Per-process entry point: configure the environment and run the Trainer."""
    # (Deterministic-seeding code was commented out in the original; kept off.)
    # if exp.seed is not None:
    #     random.seed(exp.seed)
    #     torch.manual_seed(exp.seed)
    #     cudnn.deterministic = True

    # set environment variables for distributed training
    configure_nccl()
    configure_omp()
    cudnn.benchmark = True

    trainer = Trainer(exp, args)
    trainer.train()


if __name__ == "__main__":
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    exp.merge(args.opts)

    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    num_gpu = get_num_devices() if args.devices is None else args.devices
    assert num_gpu <= get_num_devices()

    launch(
        main,
        num_gpu,
        args.num_machines,
        args.machine_rank,
        args=(exp, args),
    )
yolox/core/trainer.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.import datetime
import os
import time
from loguru import loggerimport torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriterfrom yolox.data import DataPrefetcher
from yolox.utils import (MeterBuffer,ModelEMA,all_reduce_norm,get_local_rank,get_model_info,get_rank,get_world_size,gpu_mem_usage,is_parallel,load_ckpt,occupy_mem,save_checkpoint,setup_logger,synchronize
)#跳转1
class Trainer:def __init__(self, exp, args):# init function only defines some basic attr, other attrs like model, optimizer are built in# before_train methods.self.exp = exp #数据参数配置self.args = args #训练参数配置# training related attrself.max_epoch = exp.max_epochself.amp_training = args.fp16self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)self.is_distributed = get_world_size() > 1 #分布式训练self.rank = get_rank() #用来判断torch版本和分布式self.local_rank = get_local_rank() #用来配置cuda,采用gpu进行训练,跳转get_local_rank()self.device = "cuda:{}".format(self.local_rank)#跳转self.local_rankself.use_model_ema = exp.ema #ema所得的数据是在测试时候使用的,这里不进行梯度下降#取最后n步权重进行平均,能使得模型更加鲁棒# data/dataloader related attrself.data_type = torch.float16 if args.fp16 else torch.float32self.input_size = exp.input_sizeself.best_ap = 0# metric recordself.meter = MeterBuffer(window_size=exp.print_interval) #计算并存储平均值和当前值self.file_name = os.path.join(exp.output_dir, args.experiment_name) #输出地址if self.rank == 0:os.makedirs(self.file_name, exist_ok=True) #创建输出文件夹,若exist_ok为False则抛出OSErrorsetup_logger( #日志self.file_name,distributed_rank=self.rank,filename="train_log.txt",mode="a",)
#跳转1返回到train.pydef train(self):#train.py跳转2self.before_train()#trainer.py跳转1 有模型定义.get_model()try:self.train_in_epoch()#trainer.py跳转2 中间参数更新定义except Exception:raisefinally:self.after_train() #trainer.py跳转3 保存权重收尾#trainer.py跳转2def train_in_epoch(self):for self.epoch in range(self.start_epoch, self.max_epoch):self.before_epoch() #跳转 打印部分超参数分配细节self.train_in_iter() #跳转 定义参数更新、打印输出细节self.after_epoch() #跳转 收尾,保存权重def train_in_iter(self):for self.iter in range(self.max_iter):self.before_iter() #passself.train_one_iter() #中间参数更新过程self.after_iter() #终端打印输出细节def train_one_iter(self):iter_start_time = time.time() #数据处理计时inps, targets = self.prefetcher.next() #下一轮数据加载inps = inps.to(self.data_type) #怎样训练,16 or 32targets = targets.to(self.data_type)targets.requires_grad = Falseinps, targets = self.exp.preprocess(inps, targets, self.input_size) #统一输入图片尺寸640*640data_end_time = time.time()with torch.cuda.amp.autocast(enabled=self.amp_training): # 前向过程model开启autocastoutputs = self.model(inps, targets)loss = outputs["total_loss"]self.optimizer.zero_grad() # 梯度初始化为零,把loss关于weight的导数变成0self.scaler.scale(loss).backward() # scaler实现的反向误差传播self.scaler.step(self.optimizer) # 优化器中的值也需要放缩self.scaler.update() # 更新scalerif self.use_model_ema: #使用滑动平均self.ema_model.update(self.model)lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1) #更新学习率for param_group in self.optimizer.param_groups:param_group["lr"] = lriter_end_time = time.time()self.meter.update( #存储数值更新iter_time=iter_end_time - iter_start_time,data_time=data_end_time - iter_start_time,lr=lr,**outputs,)
# Trainer.before_train (reconstructed from a collapsed paste).
def before_train(self):
    """Build model, optimizer, dataloader, LR scheduler and evaluator."""
    logger.info("args: {}".format(self.args))
    logger.info("exp value:\n{}".format(self.exp))

    # model related init
    torch.cuda.set_device(self.local_rank)
    # Model entry point: exp.get_model() assembles backbone + head (see
    # yolox_base.py -> models/darknet.py / network_blocks.py).
    model = self.exp.get_model()
    logger.info(
        "Model Summary: {}".format(get_model_info(model, self.exp.test_size))
    )
    model.to(self.device)

    # solver related init: bias / weight / BN parameter groups + SGD
    self.optimizer = self.exp.get_optimizer(self.args.batch_size)

    # value of epoch will be set in `resume_train`
    model = self.resume_train(model)   # handles resume-from-checkpoint

    # data related init.
    # Example: max_epoch=20, no_aug_epochs=15 -> augmentation runs only in the
    # first 5 epochs (no_aug is False during those epochs).
    self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
    self.train_loader = self.exp.get_data_loader(   # see yolox_voc_s.py
        batch_size=self.args.batch_size,
        is_distributed=self.is_distributed,
        no_aug=self.no_aug,
        cache_img=self.args.cache,
    )
    logger.info("init prefetcher, this might take one minute or less...")
    # The prefetcher streams the next batch to GPU memory while the model is
    # running, keeping the gap between iterations very small.
    self.prefetcher = DataPrefetcher(self.train_loader)

    # max_iter means iters per epoch
    self.max_iter = len(self.train_loader)

    # lr = (0.01 / 64.0) * batch_size
    self.lr_scheduler = self.exp.get_lr_scheduler(
        self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
    )
    if self.args.occupy:
        occupy_mem(self.local_rank)   # grab GPU memory up front

    if self.is_distributed:   # DDP wrapping for multi-GPU training
        model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)

    if self.use_model_ema:   # exponential moving average of the weights
        self.ema_model = ModelEMA(model, 0.9998)
        # `updates` feeds the decay-rate computation inside ModelEMA.
        self.ema_model.updates = self.max_iter * self.start_epoch

    self.model = model
    self.model.train()

    # Evaluator: yolox_voc_s.py -> yolox/evaluators/voc_evaluator.py
    self.evaluator = self.exp.get_evaluator(
        batch_size=self.args.batch_size, is_distributed=self.is_distributed
    )
    # Tensorboard logger
    if self.rank == 0:
        self.tblogger = SummaryWriter(self.file_name)

    logger.info("Training start...")
    logger.info("\n{}".format(model))   # print the full model
#trainer.py跳转1跳回# trainer.py跳转3def after_train(self):logger.info("Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100))
# trainer.py跳转3跳回def before_epoch(self):logger.info("---> start train epoch{}".format(self.epoch + 1))if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug:logger.info("--->No mosaic aug now!")self.train_loader.close_mosaic()logger.info("--->Add additional L1 loss now!") #最后15轮if self.is_distributed: #分布式训练self.model.module.head.use_l1 = Trueelse:self.model.head.use_l1 = Trueself.exp.eval_interval = 1if not self.no_aug:self.save_ckpt(ckpt_name="last_mosaic_epoch")def after_epoch(self):self.save_ckpt(ckpt_name="latest")if (self.epoch + 1) % self.exp.eval_interval == 0:all_reduce_norm(self.model)self.evaluate_and_save_model()
# Trainer iteration/checkpoint helpers (reconstructed from a collapsed paste;
# in the original file these are methods of Trainer).
def before_iter(self):
    """Hook before each iteration; intentionally a no-op."""
    pass


def after_iter(self):
    """
    `after_iter` contains two parts of logic:
        * log information
        * reset setting of resize
    """
    # log needed information
    if (self.iter + 1) % self.exp.print_interval == 0:
        # TODO check ETA logic
        left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1)
        eta_seconds = self.meter["iter_time"].global_avg * left_iters
        eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds)))

        progress_str = "epoch: {}/{}, iter: {}/{}".format(
            self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter
        )
        loss_meter = self.meter.get_filtered_meter("loss")
        loss_str = ", ".join(
            ["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()]
        )

        time_meter = self.meter.get_filtered_meter("time")
        time_str = ", ".join(
            ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
        )

        logger.info(
            "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format(
                progress_str,
                gpu_mem_usage(),
                time_str,
                loss_str,
                self.meter["lr"].latest,
            )
            + (", size: {:d}, {}".format(self.input_size[0], eta_str))
        )
        self.meter.clear_meters()

    # random resizing: pick a new multiscale input size every 10 global iters
    if (self.progress_in_iter + 1) % 10 == 0:
        self.input_size = self.exp.random_resize(
            self.train_loader, self.epoch, self.rank, self.is_distributed
        )


@property
def progress_in_iter(self):
    """Global iteration index across all epochs."""
    return self.epoch * self.max_iter + self.iter


def resume_train(self, model):
    """Load weights for resume (--resume) or fine-tune (--ckpt); set start_epoch."""
    if self.args.resume:
        logger.info("resume training")
        if self.args.ckpt is None:
            # default to the latest checkpoint in the experiment folder
            ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth")
        else:
            ckpt_file = self.args.ckpt

        ckpt = torch.load(ckpt_file, map_location=self.device)
        # resume the model/optimizer state dict
        model.load_state_dict(ckpt["model"])
        self.optimizer.load_state_dict(ckpt["optimizer"])
        # resume the training states variables
        start_epoch = (
            self.args.start_epoch - 1
            if self.args.start_epoch is not None
            else ckpt["start_epoch"]
        )
        self.start_epoch = start_epoch
        logger.info(
            "loaded checkpoint '{}' (epoch {})".format(self.args.resume, self.start_epoch)
        )  # noqa
    else:
        if self.args.ckpt is not None:
            logger.info("loading checkpoint for fine tuning")
            ckpt_file = self.args.ckpt
            ckpt = torch.load(ckpt_file, map_location=self.device)["model"]
            model = load_ckpt(model, ckpt)
        self.start_epoch = 0

    return model


def evaluate_and_save_model(self):
    """Evaluate (EMA weights if enabled), log AP to tensorboard, save best checkpoint."""
    if self.use_model_ema:
        evalmodel = self.ema_model.ema
    else:
        evalmodel = self.model
        if is_parallel(evalmodel):
            evalmodel = evalmodel.module   # unwrap DDP

    ap50_95, ap50, summary = self.exp.eval(evalmodel, self.evaluator, self.is_distributed)
    self.model.train()
    if self.rank == 0:
        self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
        self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1)
        logger.info("\n" + summary)
    synchronize()

    self.save_ckpt("last_epoch", ap50_95 > self.best_ap)
    self.best_ap = max(self.best_ap, ap50_95)


def save_ckpt(self, ckpt_name, update_best_ckpt=False):
    """Save model+optimizer state on rank 0 (EMA weights when enabled)."""
    if self.rank == 0:
        save_model = self.ema_model.ema if self.use_model_ema else self.model
        logger.info("Save weights to {}".format(self.file_name))
        ckpt_state = {
            "start_epoch": self.epoch + 1,
            "model": save_model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
        }
        save_checkpoint(
            ckpt_state,
            update_best_ckpt,
            self.file_name,
            ckpt_name,
        )
二、参数初始化设定相关代码详解
yolox/exp/yolox_base.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.import os
import randomimport torch
import torch.distributed as dist
import torch.nn as nnfrom .base_exp import BaseExp# 这个文件是yolox最全的参数设定,可以称为基底参数设定文件,想要改变参数的初始化值,首先查看该文件。
# 同理yolox_voc_s.py也是基于这个文件额外进行数据集地址添加。
class Exp(BaseExp):
    """Base YOLOX experiment: the most complete parameter definition file.
    yolox_voc_s.py subclasses this and only adds dataset-specific settings.
    (Reconstructed from a collapsed paste; comments translated to English.)"""

    def __init__(self):
        super().__init__()

        # ---------------- model config ---------------- #
        self.num_classes = 12   # number of dataset classes
        self.depth = 1.00
        self.width = 1.00
        self.act = 'silu'

        # ---------------- dataloader config ---------------- #
        # set worker to 4 for shorter dataloader init time
        self.data_num_workers = 4
        self.input_size = (640, 640)  # (height, width)
        # Actual multiscale ranges: [640-5*32, 640+5*32].
        # To disable multiscale training, set the
        # self.multiscale_range to 0.
        self.multiscale_range = 5
        # You can uncomment this line to specify a multiscale range
        # self.random_size = (14, 26)
        # COCO dataset settings — VOC users: see yolox_voc_s.py instead -------
        self.data_dir = None
        self.train_ann = "instances_train2017.json"
        self.val_ann = "instances_val2017.json"
        self.test_ann = "instances_test2017.json"

        # --------------- transform config ----------------- #
        self.mosaic_prob = 1.0
        self.mixup_prob = 1.0
        self.hsv_prob = 1.0
        self.flip_prob = 0.5
        self.degrees = 10.0
        self.translate = 0.1
        self.mosaic_scale = (0.1, 2)
        self.mixup_scale = (0.5, 1.5)
        self.shear = 2.0
        self.enable_mixup = True

        # --------------  training config --------------------- #
        self.warmup_epochs = 5
        self.max_epoch = 20            # total epochs
        self.warmup_lr = 0
        self.basic_lr_per_img = 0.01 / 64.0   # lr = basic_lr_per_img * batch_size (trainer.py)
        self.scheduler = "yoloxwarmcos"
        self.no_aug_epochs = 15        # disable augmentation for the last 15 epochs
        self.min_lr_ratio = 0.05
        self.ema = True                # keep an EMA copy of the weights
        self.weight_decay = 5e-4
        self.momentum = 0.9
        self.print_interval = 1        # log every N iterations
        self.eval_interval = 1         # evaluate every N epochs
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

        # -----------------  testing config ------------------ #
        self.test_size = (640, 640)
        self.test_conf = 0.01
        self.nmsthre = 0.65

    def get_model(self):
        """Build (once) and return the YOLOX model: PAFPN backbone + decoupled head."""
        from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead

        def init_yolo(M):
            # initialize BN layers with YOLO-style eps/momentum
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if getattr(self, "model", None) is None:
            in_channels = [256, 512, 1024]
            backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act)
            head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act)
            self.model = YOLOX(backbone, head)

        self.model.apply(init_yolo)               # init model weights
        self.model.head.initialize_biases(1e-2)   # init detection-head biases
        return self.model

    # COCO-dataset loader — for VOC see yolox_voc_s.py.
    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
        """Build the (mosaic-augmented) COCO training dataloader."""
        from yolox.data import (
            COCODataset,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
            worker_init_reset_seed,
        )
        from yolox.utils import (
            wait_for_the_master,
            get_local_rank,
        )

        local_rank = get_local_rank()

        with wait_for_the_master(local_rank):
            dataset = COCODataset(
                data_dir=self.data_dir,
                json_file=self.train_ann,
                img_size=self.input_size,
                preproc=TrainTransform(
                    max_labels=50,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0)
        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler
        # Make sure each process has different random seed, especially for 'fork' method.
        # Check https://github.com/pytorch/pytorch/issues/63311 for more details.
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
        train_loader = DataLoader(self.dataset, **dataloader_kwargs)
        return train_loader

    def random_resize(self, data_loader, epoch, rank, is_distributed):
        """Pick a random multiscale input size on rank 0 and broadcast it."""
        tensor = torch.LongTensor(2).cuda()
        if rank == 0:
            size_factor = self.input_size[1] * 1.0 / self.input_size[0]
            if not hasattr(self, 'random_size'):
                min_size = int(self.input_size[0] / 32) - self.multiscale_range
                max_size = int(self.input_size[0] / 32) + self.multiscale_range
                self.random_size = (min_size, max_size)
            size = random.randint(*self.random_size)
            size = (int(32 * size), 32 * int(size * size_factor))
            tensor[0] = size[0]
            tensor[1] = size[1]

        if is_distributed:
            dist.barrier()
            dist.broadcast(tensor, 0)

        input_size = (tensor[0].item(), tensor[1].item())
        return input_size

    def preprocess(self, inputs, targets, tsize):
        """Bilinearly rescale a batch (and its box targets) to size `tsize`."""
        scale_y = tsize[0] / self.input_size[0]
        scale_x = tsize[1] / self.input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = nn.functional.interpolate(
                inputs, size=tsize, mode="bilinear", align_corners=False
            )
            # targets layout: (..., class, x, y, w, h) — scale x/w and y/h
            targets[..., 1::2] = targets[..., 1::2] * scale_x
            targets[..., 2::2] = targets[..., 2::2] * scale_y
        return inputs, targets

    def get_optimizer(self, batch_size):
        """Build (once) the SGD optimizer with separate BN / weight / bias groups."""
        if "optimizer" not in self.__dict__:
            if self.warmup_epochs > 0:
                lr = self.warmup_lr
            else:
                lr = self.basic_lr_per_img * batch_size

            pg0, pg1, pg2 = [], [], []  # optimizer parameter groups

            for k, v in self.model.named_modules():
                if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
                    pg2.append(v.bias)  # biases
                if isinstance(v, nn.BatchNorm2d) or "bn" in k:
                    pg0.append(v.weight)  # no decay
                elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
                    pg1.append(v.weight)  # apply decay

            optimizer = torch.optim.SGD(
                pg0, lr=lr, momentum=self.momentum, nesterov=True
            )
            optimizer.add_param_group(
                {"params": pg1, "weight_decay": self.weight_decay}
            )  # add pg1 with weight_decay
            optimizer.add_param_group({"params": pg2})
            self.optimizer = optimizer

        return self.optimizer

    def get_lr_scheduler(self, lr, iters_per_epoch):
        """Build the warm-cosine LR scheduler used by the Trainer."""
        from yolox.utils import LRScheduler

        scheduler = LRScheduler(
            self.scheduler,
            lr,
            iters_per_epoch,
            self.max_epoch,
            warmup_epochs=self.warmup_epochs,
            warmup_lr_start=self.warmup_lr,
            no_aug_epochs=self.no_aug_epochs,
            min_lr_ratio=self.min_lr_ratio,
        )
        return scheduler

    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the COCO validation dataloader."""
        from yolox.data import COCODataset, ValTransform

        valdataset = COCODataset(
            data_dir=self.data_dir,
            json_file=self.val_ann if not testdev else self.test_ann,
            name="val2017" if not testdev else "test2017",
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
        return val_loader

    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the COCO evaluator over the validation loader."""
        from yolox.evaluators import COCOEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = COCOEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
            testdev=testdev,
        )
        return evaluator

    def eval(self, model, evaluator, is_distributed, half=False):
        """Delegate evaluation to the evaluator; returns (ap50_95, ap50, summary)."""
        return evaluator.evaluate(model, is_distributed, half)
# coco数据集----------------------------------------
yolox/exp/base_exp.py部分代码
from abc import ABCMeta, abstractmethod
from typing import Dict
from tabulate import tabulateimport torch
from torch.nn import Modulefrom yolox.utils import LRSchedulerclass BaseExp(metaclass=ABCMeta):"""Basic class for any experiment."""def __init__(self):self.seed = Noneself.output_dir = "./YOLOX_outputs" #训练时输出的地址self.print_interval = 100 #多少轮评估一次模型,这些在yolox_base.py重新完成设定self.eval_interval = 10 #多少轮评估一次模型,这些在yolox_base.py重新完成设定
exp/yolox_voc_s.py
# encoding: utf-8
import osimport torch
import torch.distributed as distfrom yolox.data import get_yolox_datadir
from yolox.exp import Exp as MyExp#这个文件才是最终模型在训练时的初始化参数,基于yolox_base.py
class Exp(MyExp):
    """VOC experiment config: the parameters actually used at train time,
    overriding yolox_base.py. (Reconstructed from a collapsed paste.)"""

    def __init__(self):
        super(Exp, self).__init__()
        # ---------------- model config ---------------- #
        self.num_classes = 12   # number of dataset classes
        self.depth = 0.33
        self.width = 0.50
        self.warmup_epochs = 1

        # ---------- transform config ------------ #
        self.mosaic_prob = 1.0
        self.mixup_prob = 1.0
        self.hsv_prob = 1.0
        self.flip_prob = 0.5

        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
        """Build the mosaic-augmented VOC training dataloader."""
        from yolox.data import (
            VOCDetection,            # VOC dataset wrapper
            TrainTransform,          # flip + HSV augmentation
            YoloBatchSampler,        # yields (mosaic, index) mini-batch tuples
            DataLoader,
            InfiniteSampler,         # endless sampler so data never runs out
            MosaicDetection,         # mosaic / mixup augmentation
            worker_init_reset_seed,  # per-worker random seed
        )
        from yolox.utils import (
            wait_for_the_master,   # dist.barrier() under distributed training
            get_local_rank,
        )

        local_rank = get_local_rank()

        # Under distributed training this synchronizes all processes.
        with wait_for_the_master(local_rank):
            dataset = VOCDetection(
                data_dir="D:/darkyolo/YOLOX-main/datasets/VOCdevkit/VOC2007/",
                # NOTE(review): ('train') is just the string 'train', not a tuple;
                # the author compensates by editing the indexing in voc.py
                # (image_sets[0][1] -> image_sets[0]) — verify against voc.py.
                image_sets=[('train')],
                img_size=self.input_size,
                # TrainTransform defines __call__, so the instance is callable.
                preproc=TrainTransform(
                    max_labels=100,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        dataset = MosaicDetection(
            dataset,                    # the VOC dataset above
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0)
        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler
        # Make sure each process has different random seed, especially for 'fork' method
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
        train_loader = DataLoader(self.dataset, **dataloader_kwargs)
        return train_loader

    # Evaluation loader: same structure as the training loader.
    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the VOC validation dataloader."""
        from yolox.data import VOCDetection, ValTransform

        valdataset = VOCDetection(
            data_dir="D:/darkyolo/YOLOX-main/datasets/VOCdevkit/VOC2007/",
            image_sets=[('val')],
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),   # normalizes input image size
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
        return val_loader

    # Model evaluation, executed after training completes.
    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the VOC evaluator over the validation loader."""
        from yolox.evaluators import VOCEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = VOCEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
        )
        return evaluator