561 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			561 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #-------------------------------------#
 | ||
| #       对数据集进行训练
 | ||
| #-------------------------------------#
 | ||
| import datetime
 | ||
| import os
 | ||
| from functools import partial
 | ||
| 
 | ||
| import numpy as np
 | ||
| import torch
 | ||
| import torch.backends.cudnn as cudnn
 | ||
| import torch.distributed as dist
 | ||
| import torch.nn as nn
 | ||
| import torch.optim as optim
 | ||
| from torch.utils.data import DataLoader
 | ||
| 
 | ||
| from network.yolo import YoloBody
 | ||
| from network.yolo_training import (ModelEMA, YOLOLoss, get_lr_scheduler,
 | ||
|                                 set_optimizer_lr, weights_init)
 | ||
| from utils.callbacks import EvalCallback, LossHistory
 | ||
| from utils.dataloader import YoloDataset, yolo_dataset_collate
 | ||
| from utils.utils import (download_weights, get_anchors, get_classes,
 | ||
|                          seed_everything, show_config, worker_init_fn)
 | ||
| from utils.utils_fit import fit_one_epoch
 | ||
| 
 | ||
| import configparser
 | ||
| 
 | ||
if __name__ == "__main__":
    # ast is only needed to parse the list-valued options below, so it is
    # imported locally next to its use.
    import ast

    conf = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed. Fail early with a clear message instead of a
    # later, less obvious NoSectionError.
    if not conf.read('config.ini', encoding='utf-8'):
        raise FileNotFoundError("config.ini was not found or could not be read.")
    #---------------------------------#
    #   Cuda    whether to use CUDA
    #           set to False when no GPU is available
    #---------------------------------#
    Cuda            = conf.getboolean('Train', 'Cuda')
    #----------------------------------------------#
    #   seed    fixes the random seed so that independent
    #           training runs produce the same result
    #----------------------------------------------#
    seed            = conf.getint('Train', 'seed')
    #---------------------------------------------------------------------#
    #   distributed     whether to run single-machine multi-GPU distributed training.
    #                   The terminal commands below only work on Ubuntu;
    #                   CUDA_VISIBLE_DEVICES selects the GPUs on Ubuntu.
    #                   On Windows all GPUs are used in DP mode by default; DDP is not supported.
    #   DP mode:
    #       set         distributed = False
    #       run         CUDA_VISIBLE_DEVICES=0,1 python train.py
    #   DDP mode:
    #       set         distributed = True
    #       run         CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
    #---------------------------------------------------------------------#
    distributed     = conf.getboolean('Train', 'distributed')
    #---------------------------------------------------------------------#
    #   sync_bn     whether to use SyncBatchNorm (available for multi-GPU DDP)
    #---------------------------------------------------------------------#
    sync_bn         = conf.getboolean('Train', 'sync_bn')
    #---------------------------------------------------------------------#
    #   fp16        whether to use mixed-precision training
    #               roughly halves GPU memory use; requires pytorch 1.7.1+
    #---------------------------------------------------------------------#
    fp16            = conf.getboolean('Train', 'fp16')
    #---------------------------------------------------------------------#
    #   classes_path    points to the txt under model_data describing your dataset's classes.
    #                   It must be changed before training to match your own dataset.
    #---------------------------------------------------------------------#
    classes_path    = conf.get('Train', 'classes_path')
    #---------------------------------------------------------------------#
    #   anchors_path    txt file holding the anchor boxes; normally left unchanged.
    #   anchors_mask    helps the code pick the matching anchors; normally left unchanged.
    #---------------------------------------------------------------------#
    anchors_path    = conf.get('Train', 'anchors_path')
    # literal_eval accepts only Python literals (e.g. nested lists of ints),
    # so text in the config file cannot execute code the way eval() would.
    anchors_mask    = ast.literal_eval(conf.get('Train', 'anchors_mask'))
    #----------------------------------------------------------------------------------------------------------------------------#
    #   See the README for where to download weight files. Pretrained weights are generic across
    #   datasets because the learned features are generic. Their most important part is the
    #   backbone (feature-extraction) weights.
    #   Pretrained weights should be used in 99% of cases; without them the backbone weights are
    #   too random, feature extraction is poor and the training result will not be good.
    #
    #   To resume an interrupted run, set model_path to a weight file in the logs folder and
    #   adjust the freeze/unfreeze parameters below so the epoch count stays continuous.
    #
    #   model_path = '' means no whole-model weights are loaded.
    #
    #   These are whole-model weights, which is why they are loaded here in train.py.
    #   To train from scratch set model_path = '' and Freeze_Train = False below; training then
    #   starts from random weights without a frozen-backbone stage.
    #
    #   Training from scratch generally works poorly because the weights are too random, so it
    #   is very strongly discouraged. There are two ways to do it anyway:
    #   1. Thanks to the strong Mosaic augmentation, with a large UnFreeze_Epoch (300+), a large
    #      batch (16+) and lots of data (10k+), set mosaic=True and train from random
    #      initialisation; the result is still worse than with pretraining.
    #      (Feasible for large datasets such as COCO.)
    #   2. Train a classification model on ImageNet first to obtain backbone weights; the
    #      classifier's backbone is shared with this model, so train from those weights.
    #----------------------------------------------------------------------------------------------------------------------------#
    model_path      = conf.get('Train', 'model_path')
    #------------------------------------------------------#
    #   input_shape     input size; must be a multiple of 32
    #------------------------------------------------------#
    input_shape     = ast.literal_eval(conf.get('Train', 'input_shape'))
    #------------------------------------------------------#
    #   backbone        cspdarknet (default)
    #                   convnext_tiny
    #                   convnext_small
    #                   swin_transfomer_tiny
    #------------------------------------------------------#
    backbone        = conf.get('Train', 'backbone')
    #----------------------------------------------------------------------------------------------------------------------------#
    #   pretrained      whether to use pretrained backbone weights. These are backbone-only
    #                   weights, so they are loaded when the model is constructed.
    #                   If model_path is set, backbone weights need not be loaded and this value is irrelevant.
    #                   If model_path is empty and pretrained = True, only the backbone is loaded.
    #                   If model_path is empty, pretrained = False and Freeze_Train = False,
    #                   training starts from scratch without a frozen-backbone stage.
    #----------------------------------------------------------------------------------------------------------------------------#
    pretrained      = conf.getboolean('Train', 'pretrained')
    #------------------------------------------------------#
    #   phi             YoloV5 variant to use: s, m, l, x
    #                   for backbones other than cspdarknet it only affects the PANet size
    #------------------------------------------------------#
    phi             = conf.get('Train', 'phi')
    # ------------------------------------------------------------------#
    #   mosaic              Mosaic data augmentation.
    #   mosaic_prob         probability of applying mosaic per step (default 50%).
    #
    #   mixup               whether to use mixup augmentation; only effective when mosaic=True.
    #                       mixup is applied only to mosaic-augmented images.
    #   mixup_prob          probability of applying mixup after mosaic (default 50%).
    #                       The overall mixup probability is mosaic_prob * mixup_prob.
    #
    #   special_aug_ratio   following YoloX: mosaic images are far from the natural image
    #                       distribution, so when mosaic=True it is only enabled within the
    #                       special_aug_ratio range. Default is the first 70% of epochs
    #                       (70 out of 100 epochs).
    # ------------------------------------------------------------------#
    mosaic              = conf.getboolean('Train', 'mosaic')
    mosaic_prob         = conf.getfloat('Train', 'mosaic_prob')
    mixup               = conf.getboolean('Train', 'mixup')
    mixup_prob          = conf.getfloat('Train', 'mixup_prob')
    special_aug_ratio   = conf.getfloat('Train', 'special_aug_ratio')
    #------------------------------------------------------------------#
    #   label_smoothing     label smoothing; usually below 0.01, e.g. 0.01 or 0.005.
    #------------------------------------------------------------------#
    label_smoothing     = conf.getfloat('Train', 'label_smoothing')
 | ||
| 
 | ||
|     #----------------------------------------------------------------------------------------------------------------------------#
 | ||
|     #   训练分为两个阶段,分别是冻结阶段和解冻阶段。设置冻结阶段是为了满足机器性能不足的同学的训练需求。
 | ||
|     #   冻结训练需要的显存较小,显卡非常差的情况下,可设置Freeze_Epoch等于UnFreeze_Epoch,Freeze_Train = True,此时仅仅进行冻结训练。
 | ||
|     #      
 | ||
|     #   在此提供若干参数设置建议,各位训练者根据自己的需求进行灵活调整:
 | ||
|     #   (一)从整个模型的预训练权重开始训练: 
 | ||
|     #       Adam:
 | ||
|     #           Init_Epoch = 0,Freeze_Epoch = 50,UnFreeze_Epoch = 100,Freeze_Train = True,optimizer_type = 'adam',Init_lr = 1e-3,weight_decay = 0。(冻结)
 | ||
|     #           Init_Epoch = 0,UnFreeze_Epoch = 100,Freeze_Train = False,optimizer_type = 'adam',Init_lr = 1e-3,weight_decay = 0。(不冻结)
 | ||
|     #       SGD:
 | ||
|     #           Init_Epoch = 0,Freeze_Epoch = 50,UnFreeze_Epoch = 300,Freeze_Train = True,optimizer_type = 'sgd',Init_lr = 1e-2,weight_decay = 5e-4。(冻结)
 | ||
|     #           Init_Epoch = 0,UnFreeze_Epoch = 300,Freeze_Train = False,optimizer_type = 'sgd',Init_lr = 1e-2,weight_decay = 5e-4。(不冻结)
 | ||
|     #       其中:UnFreeze_Epoch可以在100-300之间调整。
 | ||
|     #   (二)从0开始训练:
 | ||
|     #       Init_Epoch = 0,UnFreeze_Epoch >= 300,Unfreeze_batch_size >= 16,Freeze_Train = False(不冻结训练)
 | ||
|     #       其中:UnFreeze_Epoch尽量不小于300。optimizer_type = 'sgd',Init_lr = 1e-2,mosaic = True。
 | ||
|     #   (三)batch_size的设置:
 | ||
|     #       在显卡能够接受的范围内,以大为好。显存不足与数据集大小无关,提示显存不足(OOM或者CUDA out of memory)请调小batch_size。
 | ||
|     #       受到BatchNorm层影响,batch_size最小为2,不能为1。
 | ||
|     #       正常情况下Freeze_batch_size建议为Unfreeze_batch_size的1-2倍。不建议设置的差距过大,因为关系到学习率的自动调整。
 | ||
|     #----------------------------------------------------------------------------------------------------------------------------#
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   冻结阶段训练参数
 | ||
|     #   此时模型的主干被冻结了,特征提取网络不发生改变
 | ||
|     #   占用的显存较小,仅对网络进行微调
 | ||
|     #   Init_Epoch          模型当前开始的训练世代,其值可以大于Freeze_Epoch,如设置:
 | ||
|     #                       Init_Epoch = 60、Freeze_Epoch = 50、UnFreeze_Epoch = 100
 | ||
|     #                       会跳过冻结阶段,直接从60代开始,并调整对应的学习率。
 | ||
|     #                       (断点续练时使用)
 | ||
|     #   Freeze_Epoch        模型冻结训练的Freeze_Epoch
 | ||
|     #                       (当Freeze_Train=False时失效)
 | ||
|     #   Freeze_batch_size   模型冻结训练的batch_size
 | ||
|     #                       (当Freeze_Train=False时失效)
 | ||
|     #------------------------------------------------------------------#
 | ||
|     Init_Epoch          = 0
 | ||
|     Freeze_Epoch        = 50
 | ||
|     Freeze_batch_size   = 10
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   解冻阶段训练参数
 | ||
|     #   此时模型的主干不被冻结了,特征提取网络会发生改变
 | ||
|     #   占用的显存较大,网络所有的参数都会发生改变
 | ||
|     #   UnFreeze_Epoch          模型总共训练的epoch
 | ||
|     #                           SGD需要更长的时间收敛,因此设置较大的UnFreeze_Epoch
 | ||
|     #                           Adam可以使用相对较小的UnFreeze_Epoch
 | ||
|     #   Unfreeze_batch_size     模型在解冻后的batch_size
 | ||
|     #------------------------------------------------------------------#
 | ||
|     UnFreeze_Epoch      = 100
 | ||
|     Unfreeze_batch_size = 4
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   Freeze_Train    是否进行冻结训练
 | ||
|     #                   默认先冻结主干训练后解冻训练。
 | ||
|     #------------------------------------------------------------------#
 | ||
|     Freeze_Train        = True
 | ||
| 
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   其它训练参数:学习率、优化器、学习率下降有关
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   Init_lr         模型的最大学习率
 | ||
|     #   Min_lr          模型的最小学习率,默认为最大学习率的0.01
 | ||
|     #------------------------------------------------------------------#
 | ||
|     Init_lr             = 1e-2
 | ||
|     Min_lr              = Init_lr * 0.01
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   optimizer_type  使用到的优化器种类,可选的有adam、sgd
 | ||
|     #                   当使用Adam优化器时建议设置  Init_lr=1e-3
 | ||
|     #                   当使用SGD优化器时建议设置   Init_lr=1e-2
 | ||
|     #   momentum        优化器内部使用到的momentum参数
 | ||
|     #   weight_decay    权值衰减,可防止过拟合
 | ||
|     #                   adam会导致weight_decay错误,使用adam时建议设置为0。
 | ||
|     #------------------------------------------------------------------#
 | ||
|     optimizer_type      = "sgd"
 | ||
|     momentum            = 0.937
 | ||
|     weight_decay        = 5e-4
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   lr_decay_type   使用到的学习率下降方式,可选的有step、cos
 | ||
|     #------------------------------------------------------------------#
 | ||
|     lr_decay_type       = "cos"
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   save_period     多少个epoch保存一次权值
 | ||
|     #------------------------------------------------------------------#
 | ||
|     save_period         = 10
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   save_dir        权值与日志文件保存的文件夹
 | ||
|     #------------------------------------------------------------------#
 | ||
|     save_dir            = 'logs'
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   eval_flag       是否在训练时进行评估,评估对象为验证集
 | ||
|     #                   安装pycocotools库后,评估体验更佳。
 | ||
|     #   eval_period     代表多少个epoch评估一次,不建议频繁的评估
 | ||
|     #                   评估需要消耗较多的时间,频繁评估会导致训练非常慢
 | ||
|     #   此处获得的mAP会与get_map.py获得的会有所不同,原因有二:
 | ||
|     #   (一)此处获得的mAP为验证集的mAP。
 | ||
|     #   (二)此处设置评估参数较为保守,目的是加快评估速度。
 | ||
|     #------------------------------------------------------------------#
 | ||
|     eval_flag           = True
 | ||
|     eval_period         = 10
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   num_workers     用于设置是否使用多线程读取数据
 | ||
|     #                   开启后会加快数据读取速度,但是会占用更多内存
 | ||
|     #                   内存较小的电脑可以设置为2或者0  
 | ||
|     #------------------------------------------------------------------#
 | ||
|     num_workers         = 6
 | ||
| 
 | ||
|     #------------------------------------------------------#
 | ||
|     #   train_annotation_path   训练图片路径和标签
 | ||
|     #   val_annotation_path     验证图片路径和标签
 | ||
|     #------------------------------------------------------#
 | ||
|     train_annotation_path   = 'model_data/2007_train.txt'
 | ||
|     val_annotation_path     = 'model_data/2007_val.txt'
 | ||
| 
 | ||
|     seed_everything(seed)
 | ||
|     #------------------------------------------------------#
 | ||
|     #   设置用到的显卡
 | ||
|     #------------------------------------------------------#
 | ||
|     ngpus_per_node  = torch.cuda.device_count()
 | ||
|     if distributed:
 | ||
|         dist.init_process_group(backend="nccl")
 | ||
|         local_rank  = int(os.environ["LOCAL_RANK"])
 | ||
|         rank        = int(os.environ["RANK"])
 | ||
|         device      = torch.device("cuda", local_rank)
 | ||
|         if local_rank == 0:
 | ||
|             print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
 | ||
|             print("Gpu Device Count : ", ngpus_per_node)
 | ||
|     else:
 | ||
|         device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 | ||
|         print("\033[1;33;44mRuning on {}\033[0m".format(device))
 | ||
|         local_rank      = 0
 | ||
|         rank            = 0
 | ||
| 
 | ||
|     #------------------------------------------------------#
 | ||
|     #   获取classes和anchor
 | ||
|     #------------------------------------------------------#
 | ||
|     class_names, num_classes = get_classes(classes_path)
 | ||
|     anchors, num_anchors     = get_anchors(anchors_path)
 | ||
| 
 | ||
|     #----------------------------------------------------#
 | ||
|     #   下载预训练权重
 | ||
|     #----------------------------------------------------#
 | ||
|     if pretrained:
 | ||
|         if distributed:
 | ||
|             if local_rank == 0:
 | ||
|                 download_weights(backbone, phi)  
 | ||
|             dist.barrier()
 | ||
|         else:
 | ||
|             download_weights(backbone, phi)
 | ||
| 
 | ||
|     #------------------------------------------------------#
 | ||
|     #   创建yolo模型
 | ||
|     #------------------------------------------------------#
 | ||
|     model = YoloBody(anchors_mask, num_classes, phi, backbone, pretrained=pretrained, input_shape=input_shape)
 | ||
|     if not pretrained:
 | ||
|         weights_init(model)
 | ||
|     if model_path != '':
 | ||
|         if local_rank == 0:
 | ||
|             print('Load weights {}.'.format(model_path))
 | ||
|         #------------------------------------------------------#
 | ||
|         #   根据预训练权重的Key和模型的Key进行加载
 | ||
|         #------------------------------------------------------#
 | ||
|         model_dict      = model.state_dict()
 | ||
|         pretrained_dict = torch.load(model_path, map_location = device)
 | ||
|         load_key, no_load_key, temp_dict = [], [], {}
 | ||
|         for k, v in pretrained_dict.items():
 | ||
|             if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v):
 | ||
|                 temp_dict[k] = v
 | ||
|                 load_key.append(k)
 | ||
|             else:
 | ||
|                 no_load_key.append(k)
 | ||
|         model_dict.update(temp_dict)
 | ||
|         model.load_state_dict(model_dict)
 | ||
|         #------------------------------------------------------#
 | ||
|         #   显示没有匹配上的Key
 | ||
|         #------------------------------------------------------#
 | ||
|         if local_rank == 0:
 | ||
|             print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
 | ||
|             print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
 | ||
|             print("\n\033[1;33;44m温馨提示,head部分没有载入是正常现象,Backbone部分没有载入是错误的。\033[0m")
 | ||
| 
 | ||
|     #----------------------#
 | ||
|     #   获得损失函数
 | ||
|     #----------------------#
 | ||
|     yolo_loss    = YOLOLoss(anchors, num_classes, input_shape, Cuda, anchors_mask, label_smoothing)
 | ||
|     #----------------------#
 | ||
|     #   记录Loss
 | ||
|     #----------------------#
 | ||
|     if local_rank == 0:
 | ||
|         time_str        = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S')
 | ||
|         log_dir         = os.path.join(save_dir, "loss_" + str(time_str))
 | ||
|         loss_history    = LossHistory(log_dir, model, input_shape=input_shape)
 | ||
|     else:
 | ||
|         loss_history    = None
 | ||
|         
 | ||
|     #------------------------------------------------------------------#
 | ||
|     #   torch 1.2不支持amp,建议使用torch 1.7.1及以上正确使用fp16
 | ||
|     #   因此torch1.2这里显示"could not be resolve"
 | ||
|     #------------------------------------------------------------------#
 | ||
|     if fp16:
 | ||
|         from torch.cuda.amp import GradScaler as GradScaler
 | ||
|         scaler = GradScaler()
 | ||
|     else:
 | ||
|         scaler = None
 | ||
| 
 | ||
|     model_train     = model.train()
 | ||
|     #----------------------------#
 | ||
|     #   多卡同步Bn
 | ||
|     #----------------------------#
 | ||
|     if sync_bn and ngpus_per_node > 1 and distributed:
 | ||
|         model_train = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_train)
 | ||
|     elif sync_bn:
 | ||
|         print("Sync_bn is not support in one gpu or not distributed.")
 | ||
| 
 | ||
|     if Cuda:
 | ||
|         if distributed:
 | ||
|             #----------------------------#
 | ||
|             #   多卡平行运行
 | ||
|             #----------------------------#
 | ||
|             model_train = model_train.cuda(local_rank)
 | ||
|             model_train = torch.nn.parallel.DistributedDataParallel(model_train, device_ids=[local_rank], find_unused_parameters=True)
 | ||
|         else:
 | ||
|             model_train = torch.nn.DataParallel(model)
 | ||
|             cudnn.benchmark = True
 | ||
|             model_train = model_train.cuda()
 | ||
|             
 | ||
|     #----------------------------#
 | ||
|     #   权值平滑
 | ||
|     #----------------------------#
 | ||
|     ema = ModelEMA(model_train)
 | ||
|     
 | ||
|     #---------------------------#
 | ||
|     #   读取数据集对应的txt
 | ||
|     #---------------------------#
 | ||
|     with open(train_annotation_path, encoding='utf-8') as f:
 | ||
|         train_lines = f.readlines()
 | ||
|     with open(val_annotation_path, encoding='utf-8') as f:
 | ||
|         val_lines   = f.readlines()
 | ||
|     num_train   = len(train_lines)
 | ||
|     num_val     = len(val_lines)
 | ||
| 
 | ||
|     if local_rank == 0:
 | ||
|         show_config(
 | ||
|             classes_path = classes_path, anchors_path = anchors_path, anchors_mask = anchors_mask, model_path = model_path, input_shape = input_shape, \
 | ||
|             Init_Epoch = Init_Epoch, Freeze_Epoch = Freeze_Epoch, UnFreeze_Epoch = UnFreeze_Epoch, Freeze_batch_size = Freeze_batch_size, Unfreeze_batch_size = Unfreeze_batch_size, Freeze_Train = Freeze_Train, \
 | ||
|             Init_lr = Init_lr, Min_lr = Min_lr, optimizer_type = optimizer_type, momentum = momentum, lr_decay_type = lr_decay_type, \
 | ||
|             save_period = save_period, save_dir = save_dir, num_workers = num_workers, num_train = num_train, num_val = num_val
 | ||
|         )
 | ||
|         #---------------------------------------------------------#
 | ||
|         #   总训练世代指的是遍历全部数据的总次数
 | ||
|         #   总训练步长指的是梯度下降的总次数 
 | ||
|         #   每个训练世代包含若干训练步长,每个训练步长进行一次梯度下降。
 | ||
|         #   此处仅建议最低训练世代,上不封顶,计算时只考虑了解冻部分
 | ||
|         #----------------------------------------------------------#
 | ||
|         wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4
 | ||
|         total_step  = num_train // Unfreeze_batch_size * UnFreeze_Epoch
 | ||
|         if total_step <= wanted_step:
 | ||
|             if num_train // Unfreeze_batch_size == 0:
 | ||
|                 raise ValueError('数据集过小,无法进行训练,请扩充数据集。')
 | ||
|             wanted_epoch = wanted_step // (num_train // Unfreeze_batch_size) + 1
 | ||
|             print("\n\033[1;33;44m[Warning] 使用%s优化器时,建议将训练总步长设置到%d以上。\033[0m"%(optimizer_type, wanted_step))
 | ||
|             print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%d,Unfreeze_batch_size为%d,共训练%d个Epoch,计算出总训练步长为%d。\033[0m"%(num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step))
 | ||
|             print("\033[1;33;44m[Warning] 由于总训练步长为%d,小于建议总步长%d,建议设置总世代为%d。\033[0m"%(total_step, wanted_step, wanted_epoch))
 | ||
| 
 | ||
|     #------------------------------------------------------#
 | ||
|     #   主干特征提取网络特征通用,冻结训练可以加快训练速度
 | ||
|     #   也可以在训练初期防止权值被破坏。
 | ||
|     #   Init_Epoch为起始世代
 | ||
|     #   Freeze_Epoch为冻结训练的世代
 | ||
|     #   UnFreeze_Epoch总训练世代
 | ||
|     #   提示OOM或者显存不足请调小Batch_size
 | ||
|     #------------------------------------------------------#
 | ||
|     if True:
 | ||
|         UnFreeze_flag = False
 | ||
|         #------------------------------------#
 | ||
|         #   冻结一定部分训练
 | ||
|         #------------------------------------#
 | ||
|         if Freeze_Train:
 | ||
|             for param in model.backbone.parameters():
 | ||
|                 param.requires_grad = False
 | ||
| 
 | ||
|         #-------------------------------------------------------------------#
 | ||
|         #   如果不冻结训练的话,直接设置batch_size为Unfreeze_batch_size
 | ||
|         #-------------------------------------------------------------------#
 | ||
|         batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size
 | ||
| 
 | ||
|         #-------------------------------------------------------------------#
 | ||
|         #   判断当前batch_size,自适应调整学习率
 | ||
|         #-------------------------------------------------------------------#
 | ||
|         nbs             = 64
 | ||
|         lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
 | ||
|         lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
 | ||
|         Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
 | ||
|         Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
 | ||
| 
 | ||
|         #---------------------------------------#
 | ||
|         #   根据optimizer_type选择优化器
 | ||
|         #---------------------------------------#
 | ||
|         pg0, pg1, pg2 = [], [], []  
 | ||
|         for k, v in model.named_modules():
 | ||
|             if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
 | ||
|                 pg2.append(v.bias)    
 | ||
|             if isinstance(v, nn.BatchNorm2d) or "bn" in k:
 | ||
|                 pg0.append(v.weight)    
 | ||
|             elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
 | ||
|                 pg1.append(v.weight)   
 | ||
|         optimizer = {
 | ||
|             'adam'  : optim.Adam(pg0, Init_lr_fit, betas = (momentum, 0.999)),
 | ||
|             'sgd'   : optim.SGD(pg0, Init_lr_fit, momentum = momentum, nesterov=True)
 | ||
|         }[optimizer_type]
 | ||
|         optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay})
 | ||
|         optimizer.add_param_group({"params": pg2})
 | ||
| 
 | ||
|         #---------------------------------------#
 | ||
|         #   获得学习率下降的公式
 | ||
|         #---------------------------------------#
 | ||
|         lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
 | ||
|         
 | ||
|         #---------------------------------------#
 | ||
|         #   判断每一个世代的长度
 | ||
|         #---------------------------------------#
 | ||
|         epoch_step      = num_train // batch_size
 | ||
|         epoch_step_val  = num_val // batch_size
 | ||
|         
 | ||
|         if epoch_step == 0 or epoch_step_val == 0:
 | ||
|             raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。")
 | ||
| 
 | ||
|         if ema:
 | ||
|             ema.updates     = epoch_step * Init_Epoch
 | ||
|         
 | ||
|         #---------------------------------------#
 | ||
|         #   构建数据集加载器。
 | ||
|         #---------------------------------------#
 | ||
|         train_dataset   = YoloDataset(train_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
 | ||
|                                         mosaic=mosaic, mixup=mixup, mosaic_prob=mosaic_prob, mixup_prob=mixup_prob, train=True, special_aug_ratio=special_aug_ratio)
 | ||
|         val_dataset     = YoloDataset(val_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
 | ||
|                                         mosaic=False, mixup=False, mosaic_prob=0, mixup_prob=0, train=False, special_aug_ratio=0)
 | ||
|         
 | ||
|         if distributed:
 | ||
|             train_sampler   = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,)
 | ||
|             val_sampler     = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False,)
 | ||
|             batch_size      = batch_size // ngpus_per_node
 | ||
|             shuffle         = False
 | ||
|         else:
 | ||
|             train_sampler   = None
 | ||
|             val_sampler     = None
 | ||
|             shuffle         = True
 | ||
| 
 | ||
|         gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
 | ||
|                                     drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
 | ||
|                                     worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
 | ||
|         gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
 | ||
|                                     drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
 | ||
|                                     worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
 | ||
| 
 | ||
|         #----------------------#
 | ||
|         #   记录eval的map曲线
 | ||
|         #----------------------#
 | ||
|         if local_rank == 0:
 | ||
|             eval_callback   = EvalCallback(model, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, Cuda, \
 | ||
|                                             eval_flag=eval_flag, period=eval_period)
 | ||
|         else:
 | ||
|             eval_callback   = None
 | ||
|         
 | ||
|         #---------------------------------------#
 | ||
|         #   开始模型训练
 | ||
|         #---------------------------------------#
 | ||
|         for epoch in range(Init_Epoch, UnFreeze_Epoch):
 | ||
|             #---------------------------------------#
 | ||
|             #   如果模型有冻结学习部分
 | ||
|             #   则解冻,并设置参数
 | ||
|             #---------------------------------------#
 | ||
|             if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
 | ||
|                 batch_size = Unfreeze_batch_size
 | ||
| 
 | ||
|                 #-------------------------------------------------------------------#
 | ||
|                 #   判断当前batch_size,自适应调整学习率
 | ||
|                 #-------------------------------------------------------------------#
 | ||
|                 nbs             = 64
 | ||
|                 lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
 | ||
|                 lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
 | ||
|                 Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
 | ||
|                 Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
 | ||
|                 #---------------------------------------#
 | ||
|                 #   获得学习率下降的公式
 | ||
|                 #---------------------------------------#
 | ||
|                 lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
 | ||
| 
 | ||
|                 for param in model.backbone.parameters():
 | ||
|                     param.requires_grad = True
 | ||
| 
 | ||
|                 epoch_step      = num_train // batch_size
 | ||
|                 epoch_step_val  = num_val // batch_size
 | ||
| 
 | ||
|                 if epoch_step == 0 or epoch_step_val == 0:
 | ||
|                     raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。")
 | ||
|                     
 | ||
|                 if ema:
 | ||
|                     ema.updates     = epoch_step * epoch
 | ||
| 
 | ||
|                 if distributed:
 | ||
|                     batch_size  = batch_size // ngpus_per_node
 | ||
|                     
 | ||
|                 gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
 | ||
|                                             drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
 | ||
|                                             worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
 | ||
|                 gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
 | ||
|                                             drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
 | ||
|                                             worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
 | ||
| 
 | ||
|                 UnFreeze_flag   = True
 | ||
| 
 | ||
|             gen.dataset.epoch_now       = epoch
 | ||
|             gen_val.dataset.epoch_now   = epoch
 | ||
| 
 | ||
|             if distributed:
 | ||
|                 train_sampler.set_epoch(epoch)
 | ||
| 
 | ||
|             set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
 | ||
| 
 | ||
|             fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank)
 | ||
|             
 | ||
|             if distributed:
 | ||
|                 dist.barrier()
 | ||
| 
 | ||
|         if local_rank == 0:
 | ||
|             loss_history.writer.close() |