Initial

2024-06-13 12:13:54 +08:00
commit db40d1af1b
38 changed files with 5006 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+database/chestXray8_512/*
--- a/README.md
+++ b/README.md
@@ -0,0 +1,9 @@
+# rabat-illness-yolov5
+> 计算机视觉课程结课课题
+
+> 基于yolov5的胸部疾病目标检测
+
+
+# 数据集介绍
+
+数据集来自kaggle[数据集原地址]
--- a/config.ini
+++ b/config.ini
@@ -0,0 +1,45 @@
+[Train]
+;True使用cuda训练
+cuda                = True
+; 用于固定随机种子
+; 使得每次独立训练都可以获得一样的结果
+seed                = 12
+;distributed     用于指定是否使用单机多卡分布式运行
+;                终端指令仅支持Ubuntu。CUDA_VISIBLE_DEVICES用于在Ubuntu下指定显卡。
+;                Windows系统下默认使用DP模式调用所有显卡，不支持DDP。
+;DP模式：
+;    设置            distributed = False
+;    在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python train.py
+;DDP模式：
+;    设置            distributed = True
+;    在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
+distributed         = False
+; sync_bn     是否使用sync_bn，DDP模式多卡可用
+sync_bn             = False
+fp16                = False
+classes_path        = model_data/voc_classes.txt
+anchors_path        = model_data/yolo_anchors.txt
+anchors_mask        = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+model_path          = 
+input_shape         = [640, 640]
+backbone            = cspdarknet
+pretrained          = False
+phi                 = x
+; mosaic              马赛克数据增强。
+; mosaic_prob         每个step有多少概率使用mosaic数据增强，默认50%。
+
+; mixup               是否使用mixup数据增强，仅在mosaic=True时有效。
+;                     只会对mosaic增强后的图片进行mixup的处理。
+; mixup_prob          有多少概率在mosaic后使用mixup数据增强，默认50%。
+;                     总的mixup概率为mosaic_prob * mixup_prob。
+
+; special_aug_ratio   参考YoloX，由于Mosaic生成的训练图片，远远脱离自然图片的真实分布。
+;                     当mosaic=True时，本代码会在special_aug_ratio范围内开启mosaic。
+;                     默认为前70%个epoch，100个世代会开启70个世代。
+mosaic              = False
+mosaic_prob         = 0.5
+mixup               = True
+mixup_prob          = 0.5
+special_aug_ratio   = 0.7
+; label_smoothing     标签平滑。一般0.01以下。如0.01、0.005。
+label_smoothing     = 0.01
--- a/database/README.md
+++ b/database/README.md
--- a/logs/README.md
+++ b/logs/README.md
--- a/model_data/coco_classes.txt
+++ b/model_data/coco_classes.txt
@@ -0,0 +1,2 @@
+0
+1
--- a/model_data/simhei.ttf
+++ b/model_data/simhei.ttf
--- a/model_data/voc_classes.txt
+++ b/model_data/voc_classes.txt
@@ -0,0 +1,2 @@
+0
+1
--- a/model_data/yolo_anchors.txt
+++ b/model_data/yolo_anchors.txt
@@ -0,0 +1 @@
+10,13, 16,30, 33,23,  30,61, 62,45, 59,119,  116,90, 156,198, 373,326
--- a/model_data/yolov5_s_v6.1.pth
+++ b/model_data/yolov5_s_v6.1.pth
--- a/network/CSPdarknet.py
+++ b/network/CSPdarknet.py
@@ -0,0 +1,177 @@
+import torch
+import torch.nn as nn
+
+
+class SiLU(nn.Module):
+    @staticmethod
+    def forward(x):
+        return x * torch.sigmoid(x)
+
+def autopad(k, p=None):
+    if p is None:
+        p = k // 2 if isinstance(k, int) else [x // 2 for x in k] 
+    return p
+
+class Focus(nn.Module):
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
+        super(Focus, self).__init__()
+        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
+
+    def forward(self, x):
+        # 320, 320, 12 => 320, 320, 64
+        return self.conv(
+            # 640, 640, 3 => 320, 320, 12
+            torch.cat(
+                [
+                    x[..., ::2, ::2], 
+                    x[..., 1::2, ::2], 
+                    x[..., ::2, 1::2], 
+                    x[..., 1::2, 1::2]
+                ], 1
+            )
+        )
+
+class Conv(nn.Module):
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
+        super(Conv, self).__init__()
+        self.conv   = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+        self.bn     = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
+        self.act    = SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+    def fuseforward(self, x):
+        return self.act(self.conv(x))
+
+class Bottleneck(nn.Module):
+    # Standard bottleneck
+    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
+        super(Bottleneck, self).__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c_, c2, 3, 1, g=g)
+        self.add = shortcut and c1 == c2
+
+    def forward(self, x):
+        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
+
+class C3(nn.Module):
+    # CSP Bottleneck with 3 convolutions
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
+        super(C3, self).__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c1, c_, 1, 1)
+        self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
+        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
+        # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])
+
+    def forward(self, x):
+        return self.cv3(torch.cat(
+            (
+                self.m(self.cv1(x)), 
+                self.cv2(x)
+            )
+            , dim=1))
+
+class SPP(nn.Module):
+    # Spatial pyramid pooling layer used in YOLOv3-SPP
+    def __init__(self, c1, c2, k=(5, 9, 13)):
+        super(SPP, self).__init__()
+        c_ = c1 // 2  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
+        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
+
+    def forward(self, x):
+        x = self.cv1(x)
+        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
+        
+class CSPDarknet(nn.Module):
+    """CSPDarknet backbone: a Focus stem followed by four downsampling stages.
+
+    ``forward`` returns the outputs of ``dark3``, ``dark4`` and ``dark5``.
+
+    Args:
+        base_channels: channel count produced by the stem; later stages use
+            2x, 4x, 8x and 16x this value.
+        base_depth: number of bottlenecks in the C3 blocks of ``dark2`` and
+            ``dark5``; ``dark3`` and ``dark4`` use three times as many.
+        phi: variant key ('s', 'm', 'l' or 'x'); only read when ``pretrained``
+            is truthy, to pick the checkpoint URL.
+        pretrained: when truthy, download the backbone checkpoint and load it
+            with ``strict=False``.
+    """
+    def __init__(self, base_channels, base_depth, phi, pretrained):
+        super().__init__()
+        #-----------------------------------------------#
+        #   The shape comments below assume a 640, 640, 3
+        #   input image and base_channels = 64
+        #-----------------------------------------------#
+
+        #-----------------------------------------------#
+        #   Feature extraction with the Focus structure
+        #   640, 640, 3 -> 320, 320, 12 -> 320, 320, 64
+        #-----------------------------------------------#
+        self.stem       = Focus(3, base_channels, k=3)
+        
+        #-----------------------------------------------#
+        #   After the conv:      320, 320, 64 -> 160, 160, 128
+        #   After the CSP layer: 160, 160, 128 -> 160, 160, 128
+        #-----------------------------------------------#
+        self.dark2 = nn.Sequential(
+            # 320, 320, 64 -> 160, 160, 128
+            Conv(base_channels, base_channels * 2, 3, 2),
+            # 160, 160, 128 -> 160, 160, 128
+            C3(base_channels * 2, base_channels * 2, base_depth),
+        )
+        
+        #-----------------------------------------------#
+        #   After the conv:      160, 160, 128 -> 80, 80, 256
+        #   After the CSP layer: 80, 80, 256 -> 80, 80, 256
+        #                   The 80, 80, 256 map is tapped here as an
+        #                   output feature layer for building the FPN
+        #-----------------------------------------------#
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2),
+            C3(base_channels * 4, base_channels * 4, base_depth * 3),
+        )
+
+        #-----------------------------------------------#
+        #   After the conv:      80, 80, 256 -> 40, 40, 512
+        #   After the CSP layer: 40, 40, 512 -> 40, 40, 512
+        #                   The 40, 40, 512 map is tapped here as an
+        #                   output feature layer for building the FPN
+        #-----------------------------------------------#
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2),
+            C3(base_channels * 8, base_channels * 8, base_depth * 3),
+        )
+        
+        #-----------------------------------------------#
+        #   After the conv:      40, 40, 512 -> 20, 20, 1024
+        #   After SPP:           20, 20, 1024 -> 20, 20, 1024
+        #   After the CSP layer: 20, 20, 1024 -> 20, 20, 1024
+        #-----------------------------------------------#
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2),
+            SPP(base_channels * 16, base_channels * 16),
+            C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False),
+        )
+        if pretrained:
+            # An unknown phi raises KeyError here.
+            url = {
+                's' : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_s_backbone.pth',
+                'm' : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_m_backbone.pth',
+                'l' : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_l_backbone.pth',
+                'x' : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_x_backbone.pth',
+            }[phi]
+            # The checkpoint is cached under ./model_data; strict=False tolerates
+            # missing or unexpected keys.
+            checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
+            self.load_state_dict(checkpoint, strict=False)
+            print("Load weights from ", url.split('/')[-1])
+            
+    def forward(self, x):
+        """Return the ``(dark3, dark4, dark5)`` outputs as ``(feat1, feat2, feat3)``."""
+        x = self.stem(x)
+        x = self.dark2(x)
+        #-----------------------------------------------#
+        #   dark3 output (80, 80, 256) is an output feature layer
+        #-----------------------------------------------#
+        x = self.dark3(x)
+        feat1 = x
+        #-----------------------------------------------#
+        #   dark4 output (40, 40, 512) is an output feature layer
+        #-----------------------------------------------#
+        x = self.dark4(x)
+        feat2 = x
+        #-----------------------------------------------#
+        #   dark5 output (20, 20, 1024) is an output feature layer
+        #-----------------------------------------------#
+        x = self.dark5(x)
+        feat3 = x
+        return feat1, feat2, feat3
--- a/network/ConvNext.py
+++ b/network/ConvNext.py
@@ -0,0 +1,249 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob       = 1 - drop_prob
+    shape           = (x.shape[0],) + (1,) * (x.ndim - 1)
+    random_tensor   = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+class DropPath(nn.Module):
+    """
+    Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+        def norm_cdf(x):
+            return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+        with torch.no_grad():
+            l = norm_cdf((a - mean) / std)
+            u = norm_cdf((b - mean) / std)
+
+            tensor.uniform_(2 * l - 1, 2 * u - 1)
+            tensor.erfinv_()
+
+            tensor.mul_(std * math.sqrt(2.))
+            tensor.add_(mean)
+
+            tensor.clamp_(min=a, max=b)
+            return tensor
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+#--------------------------------------#
+#   Gelu激活函数的实现
+#   利用近似的数学公式
+#--------------------------------------#
+class GELU(nn.Module):
+    def __init__(self):
+        super(GELU, self).__init__()
+
+    def forward(self, x):
+        return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x,3))))
+    
+#---------------------------------------------------------------------------------#
+#   LayerNorm 支持两种形式channels_last (default) or channels_first. 
+#   channels_last   对应具有形状的输入(batch_size, height, width, channels) 
+#   channels_first  对应具有形状的输入(batch_size, channels, height, width).   
+#---------------------------------------------------------------------------------#
+class LayerNorm(nn.Module):
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias   = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError 
+        self.normalized_shape = (normalized_shape, )
+    
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+
+#--------------------------------------------------------------------------------------------------------------#
+#   ConvNeXt Block有两种等效的实现:
+#   (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+#   (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+#   代码中使用（2），因为这个在PyTorch中稍微快一点
+#--------------------------------------------------------------------------------------------------------------#
+class Block(nn.Module):
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        #--------------------------#
+        #   7x7的逐层卷积
+        #--------------------------#
+        self.dwconv     = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
+        self.norm       = LayerNorm(dim, eps=1e-6)
+        #--------------------------#
+        #   利用全连接层代替1x1卷积
+        #--------------------------#
+        self.pwconv1    = nn.Linear(dim, 4 * dim)
+        self.act        = GELU()
+        #--------------------------#
+        #   利用全连接层代替1x1卷积
+        #--------------------------#
+        self.pwconv2    = nn.Linear(4 * dim, dim)
+        #--------------------------#
+        #   加入缩放系数
+        #--------------------------#
+        self.gamma      = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) if layer_scale_init_value > 0 else None
+        #--------------------------#
+        #   加入Drop_path正则化
+        #--------------------------#
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        input = x
+        #--------------------------#
+        #   7x7的逐层卷积
+        #--------------------------#
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
+        x = self.norm(x)
+        #--------------------------#
+        #   利用全连接层代替1x1卷积
+        #--------------------------#
+        x = self.pwconv1(x)
+        x = self.act(x)
+        #--------------------------#
+        #   利用全连接层代替1x1卷积
+        #--------------------------#
+        x = self.pwconv2(x)
+        #--------------------------#
+        #   加入缩放系数
+        #--------------------------#
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
+        #--------------------------#
+        #   加入Drop_path正则化
+        #--------------------------#
+        x = input + self.drop_path(x)
+        return x
+
+#-----------------------------------------------------#
+#   ConvNeXt
+#   A PyTorch impl of : `A ConvNet for the 2020s`
+#   https://arxiv.org/pdf/2201.03545.pdf
+#-----------------------------------------------------#
+class ConvNeXt(nn.Module):
+    def __init__(
+        self, in_chans=3, num_classes=1000, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], 
+        drop_path_rate=0., layer_scale_init_value=1e-6, head_init_scale=1., **kwargs
+    ):
+        super().__init__()
+
+        self.downsample_layers = nn.ModuleList()
+        #--------------------------------------------------#
+        #   bs, 3, 224, 224 -> bs, 96, 56, 56
+        #--------------------------------------------------#
+        stem = nn.Sequential(
+            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
+        )
+        self.downsample_layers.append(stem)
+        
+        #--------------------------------------------------#
+        #   定义三次下采样的过程
+        #   利用步长为2x2，卷积核大小为2x2的卷积进行下采样
+        #--------------------------------------------------#
+        for i in range(3):
+            downsample_layer = nn.Sequential(
+                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+                nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
+            )
+            self.downsample_layers.append(downsample_layer)
+
+        #--------------------------------------------------#
+        #   根据深度的不同，定义不同的drop率
+        #--------------------------------------------------#
+        self.stages = nn.ModuleList()
+        dp_rates    = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 
+        cur         = 0
+        #--------------------------------------------------#
+        #   整个ConvNeXt除了Stem外，存在四个Stage
+        #   每个Stage里面是多个ConvNeXt Block的堆叠。
+        #--------------------------------------------------#
+        for i in range(4):
+            stage = nn.Sequential(
+                *[Block(dim=dims[i], drop_path=dp_rates[cur + j], layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
+            )
+            self.stages.append(stage)
+            cur += depths[i]
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            trunc_normal_(m.weight, std=.02)
+            nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        outs = []
+        for i in range(4):
+            x = self.downsample_layers[i](x)
+            x = self.stages[i](x)
+            if i != 0:
+                outs.append(x)
+        return outs
+
+# Download URLs of the pretrained ConvNeXt checkpoints, keyed by variant name;
+# read by the ConvNeXt_Tiny / ConvNeXt_Small factory functions below.
+model_urls = {
+    "convnext_tiny_1k"      : "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/convnext_tiny_1k_224_ema_no_jit.pth",
+    "convnext_small_1k"     : "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/convnext_small_1k_224_ema_no_jit.pth",
+}
+
+#------------------------------------------------------#
+#   Tiny约等于Cspdarknet-L的尺寸
+#------------------------------------------------------#
+def ConvNeXt_Tiny(pretrained=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_tiny_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
+        model.load_state_dict(checkpoint, strict=False)
+        print("Load weights from ", url.split('/')[-1])
+    return model
+
+#------------------------------------------------------#
+#   Small约等于Cspdarknet-X的尺寸
+#------------------------------------------------------#
+def ConvNeXt_Small(pretrained=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_small_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
+        model.load_state_dict(checkpoint, strict=False)
+        print("Load weights from ", url.split('/')[-1])
+    return model
--- a/network/Swin_transformer.py
+++ b/network/Swin_transformer.py
@@ -0,0 +1,638 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+
+def _make_divisible(v, divisor, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+        def norm_cdf(x):
+            return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+        with torch.no_grad():
+            l = norm_cdf((a - mean) / std)
+            u = norm_cdf((b - mean) / std)
+
+            tensor.uniform_(2 * l - 1, 2 * u - 1)
+            tensor.erfinv_()
+
+            tensor.mul_(std * math.sqrt(2.))
+            tensor.add_(mean)
+
+            tensor.clamp_(min=a, max=b)
+            return tensor
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+#--------------------------------------#
+#   Gelu激活函数的实现
+#   利用近似的数学公式
+#--------------------------------------#
+class GELU(nn.Module):
+    def __init__(self):
+        super(GELU, self).__init__()
+
+    def forward(self, x):
+        return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x,3))))
+
+#-------------------------------------------------------#
+#   对输入进来的图片进行高和宽的压缩
+#   并且进行通道的扩张。
+#-------------------------------------------------------#
+class PatchEmbed(nn.Module):
+    def __init__(self, img_size=[224, 224], patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        # [224, 224]
+        self.img_size           = img_size
+        # [4, 4]
+        self.patch_size         = [patch_size, patch_size]
+        # [56, 56]
+        self.patches_resolution = [self.img_size[0] // self.patch_size[0], self.img_size[1] // self.patch_size[1]]
+
+        # 3136
+        self.num_patches        = self.patches_resolution[0] * self.patches_resolution[1]
+        # 3
+        self.in_chans           = in_chans
+        # 96
+        self.embed_dim          = embed_dim
+
+        #-------------------------------------------------------#
+        #   bs, 224, 224, 3 -> bs, 56, 56, 96
+        #-------------------------------------------------------#
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        assert H == self.img_size[0] and W == self.img_size[1], \
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]} * {self.img_size[1]})."
+        #-------------------------------------------------------#
+        #   bs, 224, 224, 3 -> bs, 56, 56, 96 -> bs, 3136, 96
+        #-------------------------------------------------------#
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x
+
+def window_partition(x, window_size):
+    B, H, W, C  = x.shape
+    #------------------------------------------------------------------#
+    #   bs, 56, 56, 96 -> bs, 8, 7, 8, 7, 96 -> bs * 64, 7, 7, 96
+    #------------------------------------------------------------------#
+    x           = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows     = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+
+def window_reverse(windows, window_size, H, W):
+    #------------------------------------------------------------------#
+    #   bs * 64, 7, 7, 96 -> bs, 8, 8, 7, 7, 96 -> bs, 56, 56, 96
+    #------------------------------------------------------------------#
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """Multi-head self-attention inside a window, with a learned relative position bias.
+
+    Args:
+        dim: channel width of the input tokens.
+        window_size: ``[Wh, Ww]`` height and width of the window.
+        num_heads: number of attention heads.
+        qkv_bias: whether the fused q/k/v projection has a bias.
+        qk_scale: overrides the default ``head_dim ** -0.5`` scale when truthy.
+        attn_drop: dropout probability on the attention weights.
+        proj_drop: dropout probability after the output projection.
+    """
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.dim            = dim
+        self.window_size    = window_size  # Wh, Ww
+        self.num_heads      = num_heads
+        head_dim            = dim // num_heads
+        # `or` also falls back to the default for qk_scale=0.
+        self.scale          = qk_scale or head_dim ** -0.5
+
+        #--------------------------------------------------------------------------#
+        #   Relative position bias table: one row per possible offset between two
+        #   points of a window. For a 7x7 window the offsets range over -6 ~ +6
+        #   (13 values) on each axis, so 13 * 13 rows are needed.
+        #   Shape: 13 * 13, num_heads
+        #--------------------------------------------------------------------------#
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+        ) 
+        
+        #--------------------------------------------------------------------------#
+        #   Compute, for every pair of points inside the window, the index of
+        #   their relative offset in the table above
+        #--------------------------------------------------------------------------#
+        coords_h    = torch.arange(self.window_size[0])
+        coords_w    = torch.arange(self.window_size[1])
+        coords      = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten  = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0]    += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1]    += self.window_size[1] - 1
+        # Scale the row offset by the number of column offsets so that row + col is a unique flat index.
+        relative_coords[:, :, 0]    *= 2 * self.window_size[1] - 1
+        relative_position_index     = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        #--------------------------------------------------------------------------#
+        #   Fused linear projection producing q, k and v for multi-head attention
+        #--------------------------------------------------------------------------#
+        self.qkv        = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop  = nn.Dropout(attn_drop)
+        self.proj       = nn.Linear(dim, dim)
+        self.proj_drop  = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """Attend within each window.
+
+        ``x`` has shape ``(B_, N, C)``. ``mask``, when given, is added to the
+        attention logits before the softmax; its first dim ``nW`` must divide
+        ``B_``. Returns a tensor of shape ``(B_, N, C)``.
+        """
+        B_, N, C    = x.shape
+        #--------------------------------------------------------------------------#
+        #   e.g. bs * 64, 49, 96 -> bs * 64, 49, 96 * 3 ->
+        #   bs * 64, 49, 3, num_heads, 32 -> 3, bs * 64, num_head, 49, 32
+        #--------------------------------------------------------------------------#
+        qkv         = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        #--------------------------------------------------------------------------#
+        #   each: bs * 64, num_head, 49, 32
+        #--------------------------------------------------------------------------#
+        q, k, v     = qkv[0], qkv[1], qkv[2] 
+
+        #--------------------------------------------------------------------------#
+        #   attention logits: bs * 64, num_head, 49, 49
+        #--------------------------------------------------------------------------#
+        q       = q * self.scale
+        attn    = (q @ k.transpose(-2, -1))
+
+        #--------------------------------------------------------------------------#
+        #   Add the relative position bias to the attention logits
+        #--------------------------------------------------------------------------#
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        #--------------------------------------------------------------------------#
+        #   Add the mask so that attention stays within each region.
+        #   bs * 64, num_head, 49, 49
+        #--------------------------------------------------------------------------#
+        if mask is not None:
+            nW = mask.shape[0]
+            # Expose the window dim so the mask broadcasts over batch and heads.
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        #---------------------------------------------------------------------------------------#
+        #   bs * 64, num_head, 49, 49 @ bs * 64, num_head, 49, 32 -> bs * 64, num_head, 49, 32
+        #    
+        #   bs * 64, num_head, 49, 32 -> bs * 64, 49, 96
+        #---------------------------------------------------------------------------------------#
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob       = 1 - drop_prob
+    shape           = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor   = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """
+    Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+
+#-------------------------------------------------------#
+#   两次全连接
+#-------------------------------------------------------#
+class Mlp(nn.Module):
+    """
+    Two fully connected layers with an activation in between.
+
+    The same dropout module is applied after the activation and again after
+    the second linear layer.
+
+    Args:
+        in_features: width of the input.
+        hidden_features: width of the hidden layer; falls back to ``in_features``.
+        out_features: width of the output; falls back to ``in_features``.
+        act_layer: activation class, instantiated without arguments.
+        drop: dropout probability.
+    """
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=GELU, drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+#-------------------------------------------------------#
+#   每个阶段重复的基础模块
+#   在这其中会使用WindowAttention进行特征提取
+#-------------------------------------------------------#
+class SwinTransformerBlock(nn.Module):
+    """
+    Basic block repeated inside every stage: window attention followed by an
+    MLP, each wrapped in a residual connection.
+
+    Args:
+        dim: channel count of the incoming tokens.
+        input_resolution: (H, W) of the token grid; ``forward`` asserts L == H * W.
+        num_heads: number of attention heads handed to WindowAttention.
+        window_size: side length of the attention window. It is clamped to
+            ``min(input_resolution)`` (and shifting is disabled) when the grid
+            is not larger than the window.
+        shift_size: cyclic shift applied before windowing; 0 disables shifting.
+        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
+        qkv_bias, qk_scale, attn_drop: forwarded to WindowAttention.
+        drop: dropout rate used for the attention projection and the MLP.
+        drop_path: stochastic-depth rate; values <= 0 use nn.Identity.
+        act_layer: activation class used by the MLP.
+        norm_layer: normalisation class applied before attention and before the MLP.
+    """
+    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim                = dim
+        self.input_resolution   = input_resolution
+        self.num_heads          = num_heads
+        self.window_size        = window_size
+        self.shift_size         = shift_size
+
+        self.mlp_ratio          = mlp_ratio
+        # A grid that fits in a single window needs neither partitioning nor shifting.
+        if min(self.input_resolution) <= self.window_size:
+            self.shift_size = 0
+            self.window_size = min(self.input_resolution)
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1  = norm_layer(dim)
+        self.attn   = WindowAttention(
+            dim,
+            window_size = [self.window_size, self.window_size],
+            num_heads   = num_heads,
+            qkv_bias    = qkv_bias,
+            qk_scale    = qk_scale,
+            attn_drop   = attn_drop,
+            proj_drop   = drop
+        )
+
+        self.drop_path  = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2      = norm_layer(dim)
+        mlp_hidden_dim  = int(dim * mlp_ratio)
+        self.mlp        = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        if self.shift_size > 0:
+            #----------------------------------------------------------------#
+            #   The feature map is cyclically shifted before windowing, e.g.
+            #   [                                   [
+            #       [1, 2, 3],                          [5, 6, 4],
+            #       [4, 5, 6],          -->             [8, 9, 7],
+            #       [7, 8, 9],                          [1, 2, 3],
+            #   ]                                   ]
+            #   The mask built here makes every shifted region attend only
+            #   to positions that belong to the same region.
+            #----------------------------------------------------------------#
+            H, W = self.input_resolution
+            # The mask is built on the padded size that forward() resizes the features to.
+            _H, _W  = _make_divisible(H, self.window_size), _make_divisible(W, self.window_size)
+            img_mask = torch.zeros((1, _H, _W, 1))  # 1 H W 1
+            h_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size, -self.shift_size),
+                        slice(-self.shift_size, None))
+            w_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size, -self.shift_size),
+                        slice(-self.shift_size, None))
+            # Label each of the 3 x 3 regions with its own id.
+            cnt = 0
+            for h in h_slices:
+                for w in w_slices:
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+
+            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+            # Pairs of positions with different region ids get -100, same-region pairs get 0.
+            attn_mask       = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask       = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+            # Stored as a numpy array, so it is neither a parameter nor a buffer
+            # and never appears in the state_dict.
+            self.attn_mask  = attn_mask.cpu().numpy()
+        else:
+            self.attn_mask = None
+
+    def forward(self, x):
+        """
+        Args:
+            x: tokens of shape (B, H * W, C) with (H, W) == ``input_resolution``.
+
+        Returns:
+            Tensor of shape (B, H * W, C).
+        """
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        #-----------------------------------------------#
+        #   B, H * W, C -> B, H, W, C
+        #   (e.g. bs, 3136, 96 -> bs, 56, 56, 96)
+        #-----------------------------------------------#
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # Resize the grid so both sides are the sizes returned by _make_divisible
+        # for this window size; the resize is undone after attention.
+        _H, _W  = _make_divisible(H, self.window_size), _make_divisible(W, self.window_size)
+        x       = x.permute(0, 3, 1, 2)
+        x       = F.interpolate(x, [_H, _W], mode='bicubic', align_corners=False).permute(0, 2, 3, 1)
+
+        #-----------------------------------------------#
+        #   Cyclic shift of the feature map
+        #-----------------------------------------------#
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+        else:
+            shifted_x = x
+        #------------------------------------------------------------------------------------------#
+        #   B, H', W', C -> nW * B, ws, ws, C -> nW * B, ws * ws, C
+        #   (e.g. bs, 56, 56, 96 -> bs * 64, 7, 7, 96 -> bs * 64, 49, 96)
+        #------------------------------------------------------------------------------------------#
+        x_windows = window_partition(shifted_x, self.window_size)  # num_windows * B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
+
+        #-----------------------------------------------#
+        #   Window attention, shape unchanged
+        #-----------------------------------------------#
+        if self.attn_mask is not None:
+            # Create the mask directly on the input's device. `.cuda()` would
+            # target the current default CUDA device, which need not be the
+            # device that holds `x`.
+            attn_mask = torch.tensor(self.attn_mask, device=x.device)
+        else:
+            attn_mask = None
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+        #-----------------------------------------------#
+        #   nW * B, ws * ws, C -> B, H', W', C
+        #-----------------------------------------------#
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, _H, _W)  # B H' W' C
+
+        #-----------------------------------------------#
+        #   Undo the cyclic shift
+        #-----------------------------------------------#
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+
+        # Resize back to the original (H, W) grid.
+        x = x.permute(0, 3, 1, 2)
+        x = F.interpolate(x, [H, W], mode='bicubic', align_corners=False).permute(0, 2, 3, 1)
+        #-----------------------------------------------#
+        #   B, H, W, C -> B, H * W, C
+        #-----------------------------------------------#
+        x = x.view(B, H * W, C)
+        #-----------------------------------------------#
+        #   Residual connections around attention and
+        #   around the feed-forward network
+        #-----------------------------------------------#
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+#-------------------------------------------------------#
+#   对输入进来的特征层进行高和宽的压缩
+#   进行跨特征点的特征提取，提取完成后进行堆叠。
+#-------------------------------------------------------#
+class PatchMerging(nn.Module):
+    """
+    Halve the height and width of a token grid and project 4 * dim channels
+    down to 2 * dim.
+
+    Every 2 x 2 neighbourhood is gathered into one token by concatenating its
+    four members along the channel axis, then LayerNorm (by default) and a
+    bias-free linear layer are applied.
+
+    Args:
+        input_resolution: (H, W) of the incoming token grid; both must be even.
+        dim: channel count of the incoming tokens.
+        norm_layer: normalisation class, instantiated with ``4 * dim``.
+    """
+    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.input_resolution   = input_resolution
+        self.dim                = dim
+
+        self.norm               = norm_layer(4 * dim)
+        self.reduction          = nn.Linear(4 * dim, 2 * dim, bias=False)
+
+    def forward(self, x):
+        """Map (B, H * W, C) tokens to (B, H/2 * W/2, 2 * C)."""
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
+
+        # B, H * W, C -> B, H, W, C
+        grid = x.view(B, H, W, C)
+
+        # Four interleaved sub-grids, each B, H/2, W/2, C. The (row, col)
+        # offset order fixes the channel layout seen by self.reduction.
+        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
+        pieces  = [grid[:, r::2, c::2, :] for r, c in offsets]
+
+        # 4 x (B, H/2, W/2, C) -> B, H/2, W/2, 4 * C -> B, H/2 * W/2, 4 * C
+        merged = torch.cat(pieces, -1).view(B, -1, 4 * C)
+
+        # B, H/2 * W/2, 4 * C -> B, H/2 * W/2, 2 * C
+        return self.reduction(self.norm(merged))
+
+
+#-------------------------------------------------------#
+#   Swin-Transformer的基础模块。
+#   使用窗口多头注意力机制进行特征提取。
+#   使用PatchMerging进行高和宽的压缩。
+#-------------------------------------------------------#
+class BasicLayer(nn.Module):
+    """
+    One stage of the Swin Transformer: ``depth`` SwinTransformerBlocks followed
+    by an optional downsampling module.
+
+    Args:
+        dim: channel count of the tokens in this stage
+            (e.g. 96, 192, 384, 768 over four stages).
+        input_resolution: (H, W) of the token grid in this stage
+            (e.g. [56, 56], [28, 28], [14, 14], [7, 7]).
+        depth: number of blocks in this stage (e.g. 2, 2, 6, 2).
+        num_heads: attention heads per block.
+        window_size: attention window size; odd-indexed blocks use a shift of
+            ``window_size // 2``, even-indexed blocks use no shift.
+        mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, norm_layer: forwarded
+            to every block.
+        drop_path: a single rate, or a list with one rate per block.
+        downsample: class built as ``downsample(input_resolution, dim=dim,
+            norm_layer=norm_layer)``, or None for no downsampling.
+        use_checkpoint: run each block through ``checkpoint.checkpoint``.
+    """
+    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
+        super().__init__()
+        self.dim                = dim
+        self.input_resolution   = input_resolution
+        self.depth              = depth
+        self.use_checkpoint     = use_checkpoint
+
+        #-------------------------------------------------------#
+        #   `depth` window-attention blocks, alternating between
+        #   unshifted and shifted windows.
+        #-------------------------------------------------------#
+        self.blocks = nn.ModuleList(
+            [
+                SwinTransformerBlock(
+                    dim         = dim,
+                    input_resolution = input_resolution,
+                    num_heads   = num_heads,
+                    window_size = window_size,
+                    shift_size  = 0 if (i % 2 == 0) else window_size // 2,
+                    mlp_ratio   = mlp_ratio,
+                    qkv_bias    = qkv_bias,
+                    qk_scale    = qk_scale,
+                    drop        = drop,
+                    attn_drop   = attn_drop,
+                    drop_path   = drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    norm_layer  = norm_layer
+                )
+                for i in range(depth)
+            ]
+        )
+
+        if downsample is not None:
+            #-------------------------------------------------------#
+            #   Optional downsampling (height / width reduction)
+            #-------------------------------------------------------#
+            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        """
+        Run the blocks in sequence, then downsample if configured.
+
+        Returns:
+            (features, out): ``features`` is the output of the last block
+            before downsampling; ``out`` is the downsampled tensor, or the
+            same tensor as ``features`` when there is no downsample module.
+        """
+        # Each block must consume the previous block's output. Feeding the
+        # original input to every block would discard all but the last block
+        # and leave the earlier blocks without gradients.
+        # NOTE: weights trained before this fix saw only the last block per stage.
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        x_ = x
+        if self.downsample is not None:
+            x = self.downsample(x_)
+        return x_, x
+
+class SwinTransformer(nn.Module):
+    """
+    Swin Transformer backbone that returns the feature maps of every stage
+    except the first.
+
+    Args:
+        img_size: input image size handed to PatchEmbed.
+        patch_size, in_chans: forwarded to PatchEmbed.
+        num_classes: stored on the instance only; no classifier head is built here.
+        embed_dim: channel count of the first stage; it doubles every stage.
+        depths: number of blocks per stage.
+        num_heads: attention heads per stage.
+        window_size, mlp_ratio, qkv_bias, qk_scale: forwarded to every stage.
+        drop_rate: dropout after the patch embedding and inside the blocks.
+        attn_drop_rate: attention dropout inside the blocks.
+        drop_path_rate: final stochastic-depth rate; per-block rates rise
+            linearly from 0 to this value over all blocks.
+        norm_layer: normalisation class.
+        ape: add a learned absolute position embedding to the patch tokens.
+        patch_norm: apply ``norm_layer`` inside PatchEmbed.
+        use_checkpoint: forwarded to every stage.
+    """
+    def __init__(self, img_size=[640, 640], patch_size=4, in_chans=3, num_classes=1000,
+                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
+                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
+                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
+                 use_checkpoint=False, **kwargs):
+        super().__init__()
+        stage_count         = len(depths)
+        self.num_classes    = num_classes
+        self.num_layers     = stage_count
+        self.embed_dim      = embed_dim
+        self.ape            = ape
+        self.patch_norm     = patch_norm
+        self.num_features   = int(embed_dim * 2 ** (stage_count - 1))
+        self.mlp_ratio      = mlp_ratio
+
+        # Image -> sequence of patch tokens (e.g. bs, 3, 224, 224 -> bs, 3136, 96).
+        self.patch_embed = PatchEmbed(
+            img_size    = img_size,
+            patch_size  = patch_size,
+            in_chans    = in_chans,
+            embed_dim   = embed_dim,
+            norm_layer  = norm_layer if self.patch_norm else None
+        )
+
+        # Sequence length and grid size after patch embedding (e.g. 3136 and [56, 56]).
+        num_patches             = self.patch_embed.num_patches
+        patches_resolution      = self.patch_embed.patches_resolution
+        self.patches_resolution = patches_resolution
+
+        if self.ape:
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # Stochastic-depth decay rule: one rate per block over all stages.
+        block_rates = torch.linspace(0, drop_path_rate, sum(depths)).tolist()
+
+        # Build the stages
+        # (e.g. bs, 3136, 96 -> bs, 784, 192 -> bs, 196, 384 -> bs, 49, 768).
+        self.layers = nn.ModuleList()
+        first_block = 0
+        for stage in range(stage_count):
+            depth   = depths[stage]
+            shrink  = 2 ** stage
+            is_last = not (stage < stage_count - 1)
+            self.layers.append(
+                BasicLayer(
+                    dim                 = int(embed_dim * shrink),
+                    input_resolution    = (patches_resolution[0] // shrink, patches_resolution[1] // shrink),
+                    depth               = depth,
+                    num_heads           = num_heads[stage],
+                    window_size         = window_size,
+                    mlp_ratio           = self.mlp_ratio,
+                    qkv_bias            = qkv_bias,
+                    qk_scale            = qk_scale,
+                    drop                = drop_rate,
+                    attn_drop           = attn_drop_rate,
+                    drop_path           = block_rates[first_block:first_block + depth],
+                    norm_layer          = norm_layer,
+                    downsample          = None if is_last else PatchMerging,
+                    use_checkpoint      = use_checkpoint
+                )
+            )
+            first_block += depth
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Truncated-normal init for Linear weights, constants for biases and LayerNorm."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+            return
+        if isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'absolute_pos_embed'}
+
+    @torch.jit.ignore
+    def no_weight_decay_keywords(self):
+        return {'relative_position_bias_table'}
+
+    def forward(self, x):
+        """
+        Returns:
+            List with one (B, C, H, W) tensor per stage after the first, each
+            taken before that stage's downsampling.
+        """
+        x = self.patch_embed(x)
+        if self.ape:
+            x = x + self.absolute_pos_embed
+        x = self.pos_drop(x)
+
+        grid_h, grid_w = self.patches_resolution[0], self.patches_resolution[1]
+        outs = []
+        for stage, layer in enumerate(self.layers):
+            feat, x = layer(x)
+            if stage == 0:
+                # the first stage's features are not returned
+                continue
+            B, _, C = feat.shape
+            H, W    = grid_h // (2 ** stage), grid_w // (2 ** stage)
+            # B, H * W, C -> B, C, H, W
+            outs.append(feat.view([B, H, W, C]).permute([0, 3, 1, 2]))
+
+        return outs
+    
+def Swin_transformer_Tiny(pretrained = False, input_shape = [640, 640], **kwargs):
+    """
+    Build a SwinTransformer with depths [2, 2, 6, 2].
+
+    Args:
+        pretrained: when true, download the released weights into
+            ``./model_data`` and load them non-strictly.
+        input_shape: image size handed to SwinTransformer.
+        **kwargs: forwarded to SwinTransformer.
+    """
+    model = SwinTransformer(input_shape, depths=[2, 2, 6, 2], **kwargs)
+    if not pretrained:
+        return model
+
+    url = "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/swin_tiny_patch4_window7.pth"
+    state_dict = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
+    # strict=False: keys that do not match the model are ignored
+    model.load_state_dict(state_dict, strict=False)
+    print("Load weights from ", url.split('/')[-1])
+    return model
--- a/network/__init__.py
+++ b/network/__init__.py
@@ -0,0 +1 @@
+#
--- a/network/__pycache__/CSPdarknet.cpython-310.pyc
+++ b/network/__pycache__/CSPdarknet.cpython-310.pyc
--- a/network/__pycache__/ConvNext.cpython-310.pyc
+++ b/network/__pycache__/ConvNext.cpython-310.pyc
--- a/network/__pycache__/Swin_transformer.cpython-310.pyc
+++ b/network/__pycache__/Swin_transformer.cpython-310.pyc
--- a/network/__pycache__/__init__.cpython-310.pyc
+++ b/network/__pycache__/__init__.cpython-310.pyc
--- a/network/__pycache__/yolo.cpython-310.pyc
+++ b/network/__pycache__/yolo.cpython-310.pyc
--- a/network/__pycache__/yolo_training.cpython-310.pyc
+++ b/network/__pycache__/yolo_training.cpython-310.pyc
--- a/network/yolo.py
+++ b/network/yolo.py
@@ -0,0 +1,132 @@
+import torch
+import torch.nn as nn
+
+from .ConvNext import ConvNeXt_Small, ConvNeXt_Tiny
+from .CSPdarknet import C3, Conv, CSPDarknet
+from .Swin_transformer import Swin_transformer_Tiny
+
+
+#---------------------------------------------------#
+#   yolo_body
+#---------------------------------------------------#
+class YoloBody(nn.Module):
+    """
+    Detection network: a backbone producing three feature maps, a top-down
+    (upsample + concat) pass, a bottom-up (strided conv + concat) pass, and
+    one 1x1 convolution head per scale.
+
+    Args:
+        anchors_mask: three lists of anchor indices; their lengths set the
+            number of anchors predicted by each head.
+        num_classes: number of object classes; each anchor predicts
+            ``5 + num_classes`` values.
+        phi: model size key, one of 's', 'm', 'l', 'x'.
+        backbone: 'cspdarknet', 'convnext_tiny', 'convnext_small' or
+            'swin_transfomer_tiny' (the key really is spelled that way).
+        pretrained: forwarded to the backbone constructor.
+        input_shape: forwarded to non-CSPDarknet backbones.
+
+    Shape comments in this class assume a 640x640 input and phi == 'l'
+    (base_channels == 64).
+    """
+    def __init__(self, anchors_mask, num_classes, phi, backbone='cspdarknet', pretrained=False, input_shape=[640, 640]):
+        super(YoloBody, self).__init__()
+        # Depth / width multipliers per model size.
+        depth_dict          = {'s' : 0.33, 'm' : 0.67, 'l' : 1.00, 'x' : 1.33,}
+        width_dict          = {'s' : 0.50, 'm' : 0.75, 'l' : 1.00, 'x' : 1.25,}
+        dep_mul, wid_mul    = depth_dict[phi], width_dict[phi]
+
+        base_channels       = int(wid_mul * 64)  # 64 when phi == 'l'
+        base_depth          = max(round(dep_mul * 3), 1)  # 3 when phi == 'l'
+        #-----------------------------------------------#
+        #   The input image is 640, 640, 3
+        #   The initial base channel count is 64
+        #-----------------------------------------------#
+        self.backbone_name  = backbone
+        if backbone == "cspdarknet":
+            #---------------------------------------------------#   
+            #   Build the CSPDarknet backbone.
+            #   It yields three feature maps with shapes:
+            #   80,80,256
+            #   40,40,512
+            #   20,20,1024
+            #---------------------------------------------------#
+            self.backbone   = CSPDarknet(base_channels, base_depth, phi, pretrained)
+        else:
+            #---------------------------------------------------#   
+            #   For any other backbone, 1x1 convolutions adapt
+            #   its channel counts to the ones the neck expects.
+            #---------------------------------------------------#
+            # NOTE: the swin key is spelled 'swin_transfomer_tiny'; callers must use that spelling.
+            self.backbone       = {
+                'convnext_tiny'         : ConvNeXt_Tiny,
+                'convnext_small'        : ConvNeXt_Small,
+                'swin_transfomer_tiny'  : Swin_transformer_Tiny,
+            }[backbone](pretrained=pretrained, input_shape=input_shape)
+            # Channel counts of the three feature maps returned by each backbone.
+            in_channels         = {
+                'convnext_tiny'         : [192, 384, 768],
+                'convnext_small'        : [192, 384, 768],
+                'swin_transfomer_tiny'  : [192, 384, 768],
+            }[backbone]
+            feat1_c, feat2_c, feat3_c = in_channels 
+            self.conv_1x1_feat1 = Conv(feat1_c, base_channels * 4, 1, 1)
+            self.conv_1x1_feat2 = Conv(feat2_c, base_channels * 8, 1, 1)
+            self.conv_1x1_feat3 = Conv(feat3_c, base_channels * 16, 1, 1)
+            
+        self.upsample   = nn.Upsample(scale_factor=2, mode="nearest")
+
+        # Top-down path: reduce channels, upsample, concatenate, fuse with C3.
+        self.conv_for_feat3         = Conv(base_channels * 16, base_channels * 8, 1, 1)
+        self.conv3_for_upsample1    = C3(base_channels * 16, base_channels * 8, base_depth, shortcut=False)
+
+        self.conv_for_feat2         = Conv(base_channels * 8, base_channels * 4, 1, 1)
+        self.conv3_for_upsample2    = C3(base_channels * 8, base_channels * 4, base_depth, shortcut=False)
+
+        # Bottom-up path: stride-2 convolution, concatenate, fuse with C3.
+        self.down_sample1           = Conv(base_channels * 4, base_channels * 4, 3, 2)
+        self.conv3_for_downsample1  = C3(base_channels * 8, base_channels * 8, base_depth, shortcut=False)
+
+        self.down_sample2           = Conv(base_channels * 8, base_channels * 8, 3, 2)
+        self.conv3_for_downsample2  = C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False)
+
+        # 80, 80, 256 => 80, 80, 3 * (5 + num_classes) => 80, 80, 3 * (4 + 1 + num_classes)
+        self.yolo_head_P3 = nn.Conv2d(base_channels * 4, len(anchors_mask[2]) * (5 + num_classes), 1)
+        # 40, 40, 512 => 40, 40, 3 * (5 + num_classes) => 40, 40, 3 * (4 + 1 + num_classes)
+        self.yolo_head_P4 = nn.Conv2d(base_channels * 8, len(anchors_mask[1]) * (5 + num_classes), 1)
+        # 20, 20, 1024 => 20, 20, 3 * (5 + num_classes) => 20, 20, 3 * (4 + 1 + num_classes)
+        self.yolo_head_P5 = nn.Conv2d(base_channels * 16, len(anchors_mask[0]) * (5 + num_classes), 1)
+
+    def forward(self, x):
+        """
+        Returns:
+            (out0, out1, out2): head outputs for the P5, P4 and P3 levels, in
+            that order (coarsest feature map first under the shape assumptions
+            stated on the class).
+        """
+        #  backbone
+        feat1, feat2, feat3 = self.backbone(x)
+        if self.backbone_name != "cspdarknet":
+            # Adapt the channel counts of non-CSPDarknet backbones.
+            feat1 = self.conv_1x1_feat1(feat1)
+            feat2 = self.conv_1x1_feat2(feat2)
+            feat3 = self.conv_1x1_feat3(feat3)
+
+        # 20, 20, 1024 -> 20, 20, 512
+        P5          = self.conv_for_feat3(feat3)
+        # 20, 20, 512 -> 40, 40, 512
+        P5_upsample = self.upsample(P5)
+        # 40, 40, 512 -> 40, 40, 1024
+        P4          = torch.cat([P5_upsample, feat2], 1)
+        # 40, 40, 1024 -> 40, 40, 512
+        P4          = self.conv3_for_upsample1(P4)
+
+        # 40, 40, 512 -> 40, 40, 256
+        P4          = self.conv_for_feat2(P4)
+        # 40, 40, 256 -> 80, 80, 256
+        P4_upsample = self.upsample(P4)
+        # 80, 80, 256 cat 80, 80, 256 -> 80, 80, 512
+        P3          = torch.cat([P4_upsample, feat1], 1)
+        # 80, 80, 512 -> 80, 80, 256
+        P3          = self.conv3_for_upsample2(P3)
+        
+        # 80, 80, 256 -> 40, 40, 256
+        P3_downsample = self.down_sample1(P3)
+        # 40, 40, 256 cat 40, 40, 256 -> 40, 40, 512
+        P4 = torch.cat([P3_downsample, P4], 1)
+        # 40, 40, 512 -> 40, 40, 512
+        P4 = self.conv3_for_downsample1(P4)
+
+        # 40, 40, 512 -> 20, 20, 512
+        P4_downsample = self.down_sample2(P4)
+        # 20, 20, 512 cat 20, 20, 512 -> 20, 20, 1024
+        P5 = torch.cat([P4_downsample, P5], 1)
+        # 20, 20, 1024 -> 20, 20, 1024
+        P5 = self.conv3_for_downsample2(P5)
+
+        #---------------------------------------------------#
+        #   Third feature level
+        #   y3=(batch_size,75,80,80)   (75 = 3 * (5 + 20), i.e. with 20 classes)
+        #---------------------------------------------------#
+        out2 = self.yolo_head_P3(P3)
+        #---------------------------------------------------#
+        #   Second feature level
+        #   y2=(batch_size,75,40,40)
+        #---------------------------------------------------#
+        out1 = self.yolo_head_P4(P4)
+        #---------------------------------------------------#
+        #   First feature level
+        #   y1=(batch_size,75,20,20)
+        #---------------------------------------------------#
+        out0 = self.yolo_head_P5(P5)
+        return out0, out1, out2
+
--- a/network/yolo_training.py
+++ b/network/yolo_training.py
@@ -0,0 +1,465 @@
+import math
+from copy import deepcopy
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+class YOLOLoss(nn.Module):
+    def __init__(self, anchors, num_classes, input_shape, cuda, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0):
+        """
+        Store the loss configuration.
+
+        Args:
+            anchors: sequence of (width, height) anchor sizes.
+            num_classes: number of object classes.
+            input_shape: (height, width) of the network input.
+            cuda: flag read by ``forward`` before casting ``y_true``.
+            anchors_mask: anchor indices used by each feature level.
+            label_smoothing: smoothing factor applied to the class targets.
+        """
+        super().__init__()
+        #-----------------------------------------------------------#
+        #   Anchors per feature level (default anchor set):
+        #   20x20 -> [116,90],[156,198],[373,326]
+        #   40x40 -> [30,61],[62,45],[59,119]
+        #   80x80 -> [10,13],[16,30],[33,23]
+        #-----------------------------------------------------------#
+        self.anchors            = anchors
+        self.anchors_mask       = anchors_mask
+        self.input_shape        = input_shape
+        self.num_classes        = num_classes
+        # x, y, w, h, objectness + one score per class
+        self.bbox_attrs         = 5 + num_classes
+        self.label_smoothing    = label_smoothing
+
+        self.threshold          = 4
+
+        # Loss weights: per-level balance, then box / objectness / class ratios.
+        # The objectness ratio scales with input area relative to 640x640, the
+        # class ratio with the class count relative to 80.
+        self.balance            = [0.4, 1.0, 4]
+        self.box_ratio          = 0.05
+        self.obj_ratio          = 1 * (input_shape[0] * input_shape[1]) / (640 ** 2)
+        self.cls_ratio          = 0.5 * (num_classes / 80)
+        # NOTE(review): this attribute shadows nn.Module.cuda() on the instance.
+        self.cuda               = cuda
+
+    def clip_by_tensor(self, t, t_min, t_max):
+        """
+        Clamp ``t`` element-wise into ``[t_min, t_max]`` and return it as float32.
+
+        Uses ``torch.clamp`` rather than mask arithmetic: multiplying a zero
+        mask with an infinite entry produces NaN, whereas clamping maps
+        +/-inf to the bounds.
+
+        Args:
+            t: input tensor of any dtype.
+            t_min: lower bound.
+            t_max: upper bound.
+
+        Returns:
+            float32 tensor with the same shape as ``t``.
+        """
+        return torch.clamp(t.float(), min=t_min, max=t_max)
+
+    def MSELoss(self, pred, target):
+        """Element-wise squared error between ``pred`` and ``target`` (no reduction)."""
+        diff = pred - target
+        return torch.pow(diff, 2)
+
+    def BCELoss(self, pred, target):
+        """
+        Element-wise binary cross-entropy (no reduction).
+
+        ``pred`` is clipped into [1e-7, 1 - 1e-7] first so neither logarithm
+        sees 0.
+        """
+        epsilon     = 1e-7
+        prob        = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
+        log_pos     = torch.log(prob)
+        log_neg     = torch.log(1.0 - prob)
+        return - target * log_pos - (1.0 - target) * log_neg
+        
+    def box_giou(self, b1, b2):
+        """
+        Generalised IoU between two sets of boxes in (centre x, centre y,
+        width, height) layout.
+
+        Args:
+            b1: tensor whose last dimension starts with x, y, w, h.
+            b2: tensor with the same layout as ``b1``.
+
+        Returns:
+            Tensor of GIoU values with the last dimension removed, i.e. with
+            the leading dimensions of the inputs.
+        """
+        #----------------------------------------------------#
+        #   Top-left and bottom-right corners of the first boxes
+        #----------------------------------------------------#
+        size_1      = b1[..., 2:4]
+        half_1      = size_1/2.
+        tl_1        = b1[..., :2] - half_1
+        br_1        = b1[..., :2] + half_1
+        #----------------------------------------------------#
+        #   Top-left and bottom-right corners of the second boxes
+        #----------------------------------------------------#
+        size_2      = b2[..., 2:4]
+        half_2      = size_2/2.
+        tl_2        = b2[..., :2] - half_2
+        br_2        = b2[..., :2] + half_2
+
+        #----------------------------------------------------#
+        #   Plain IoU: overlap area over union area
+        #----------------------------------------------------#
+        overlap_br  = torch.min(br_1, br_2)
+        overlap_tl  = torch.max(tl_1, tl_2)
+        zeros       = torch.zeros_like(overlap_br)
+        overlap_wh  = torch.max(overlap_br - overlap_tl, zeros)
+        overlap     = overlap_wh[..., 0] * overlap_wh[..., 1]
+        area_1      = size_1[..., 0] * size_1[..., 1]
+        area_2      = size_2[..., 0] * size_2[..., 1]
+        union       = area_1 + area_2 - overlap
+        iou         = overlap / union
+
+        #----------------------------------------------------#
+        #   Smallest box enclosing both boxes, and its area
+        #----------------------------------------------------#
+        hull_tl     = torch.min(tl_1, tl_2)
+        hull_br     = torch.max(br_1, br_2)
+        hull_wh     = torch.max(hull_br - hull_tl, zeros)
+        hull        = hull_wh[..., 0] * hull_wh[..., 1]
+
+        # GIoU subtracts the share of the enclosing box covered by neither input box.
+        return iou - (hull - union) / hull
+
+    #---------------------------------------------------#
+    #   平滑标签
+    #---------------------------------------------------#
+    def smooth_labels(self, y_true, label_smoothing, num_classes):
+        """
+        Label smoothing: scale the targets by ``1 - label_smoothing`` and add
+        ``label_smoothing / num_classes`` to every entry.
+        """
+        keep    = 1.0 - label_smoothing
+        uniform = label_smoothing / num_classes
+        return y_true * keep + uniform
+
+    def forward(self, l, input, targets=None, y_true=None):
+        #----------------------------------------------------#
+        #   l               代表使用的是第几个有效特征层
+        #   input的shape为  bs, 3*(5+num_classes), 20, 20
+        #                   bs, 3*(5+num_classes), 40, 40
+        #                   bs, 3*(5+num_classes), 80, 80
+        #   targets         真实框的标签情况 [batch_size, num_gt, 5]
+        #----------------------------------------------------#
+        #--------------------------------#
+        #   获得图片数量，特征层的高和宽
+        #   20, 20
+        #--------------------------------#
+        bs      = input.size(0)
+        in_h    = input.size(2)
+        in_w    = input.size(3)
+        #-----------------------------------------------------------------------#
+        #   计算步长
+        #   每一个特征点对应原来的图片上多少个像素点
+        #   [640, 640] 高的步长为640 / 20 = 32，宽的步长为640 / 20 = 32
+        #   如果特征层为20x20的话，一个特征点就对应原来的图片上的32个像素点
+        #   如果特征层为40x40的话，一个特征点就对应原来的图片上的16个像素点
+        #   如果特征层为80x80的话，一个特征点就对应原来的图片上的8个像素点
+        #   stride_h = stride_w = 32、16、8
+        #-----------------------------------------------------------------------#
+        stride_h = self.input_shape[0] / in_h
+        stride_w = self.input_shape[1] / in_w
+        #-------------------------------------------------#
+        #   此时获得的scaled_anchors大小是相对于特征层的
+        #-------------------------------------------------#
+        scaled_anchors  = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
+        #-----------------------------------------------#
+        #   输入的input一共有三个，他们的shape分别是
+        #   bs, 3 * (5+num_classes), 20, 20 => bs, 3, 5 + num_classes, 20, 20 => batch_size, 3, 20, 20, 5 + num_classes
+
+        #   batch_size, 3, 20, 20, 5 + num_classes
+        #   batch_size, 3, 40, 40, 5 + num_classes
+        #   batch_size, 3, 80, 80, 5 + num_classes
+        #-----------------------------------------------#
+        prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
+        
+        #-----------------------------------------------#
+        #   先验框的中心位置的调整参数
+        #-----------------------------------------------#
+        x = torch.sigmoid(prediction[..., 0])
+        y = torch.sigmoid(prediction[..., 1])
+        #-----------------------------------------------#
+        #   先验框的宽高调整参数
+        #-----------------------------------------------#
+        w = torch.sigmoid(prediction[..., 2]) 
+        h = torch.sigmoid(prediction[..., 3]) 
+        #-----------------------------------------------#
+        #   获得置信度，是否有物体
+        #-----------------------------------------------#
+        conf = torch.sigmoid(prediction[..., 4])
+        #-----------------------------------------------#
+        #   种类置信度
+        #-----------------------------------------------#
+        pred_cls = torch.sigmoid(prediction[..., 5:])
+        #-----------------------------------------------#
+        #   self.get_target已经合并到dataloader中
+        #   原因是在这里执行过慢，会大大延长训练时间
+        #-----------------------------------------------#
+        # y_true, noobj_mask = self.get_target(l, targets, scaled_anchors, in_h, in_w)
+
+        #---------------------------------------------------------------#
+        #   将预测结果进行解码，判断预测结果和真实值的重合程度
+        #   如果重合程度过大则忽略，因为这些特征点属于预测比较准确的特征点
+        #   作为负样本不合适
+        #----------------------------------------------------------------#
+        pred_boxes = self.get_pred_boxes(l, x, y, h, w, targets, scaled_anchors, in_h, in_w)
+
+        if self.cuda:
+            y_true          = y_true.type_as(x)
+        
+        loss    = 0
+        n       = torch.sum(y_true[..., 4] == 1)
+        if n != 0:
+            #---------------------------------------------------------------#
+            #   计算预测结果和真实结果的giou，计算对应有真实框的先验框的giou损失
+            #                         loss_cls计算对应有真实框的先验框的分类损失
+            #----------------------------------------------------------------#
+            giou        = self.box_giou(pred_boxes, y_true[..., :4]).type_as(x)
+            loss_loc    = torch.mean((1 - giou)[y_true[..., 4] == 1])
+            loss_cls    = torch.mean(self.BCELoss(pred_cls[y_true[..., 4] == 1], self.smooth_labels(y_true[..., 5:][y_true[..., 4] == 1], self.label_smoothing, self.num_classes)))
+            loss        += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
+            #-----------------------------------------------------------#
+            #   计算置信度的loss
+            #   也就意味着先验框对应的预测框预测的更准确
+            #   它才是用来预测这个物体的。
+            #-----------------------------------------------------------#
+            tobj        = torch.where(y_true[..., 4] == 1, giou.detach().clamp(0), torch.zeros_like(y_true[..., 4]))
+        else:
+            tobj        = torch.zeros_like(y_true[..., 4])
+        loss_conf   = torch.mean(self.BCELoss(conf, tobj))
+        
+        loss        += loss_conf * self.balance[l] * self.obj_ratio
+        # if n != 0:
+        #     print(loss_loc * self.box_ratio, loss_cls * self.cls_ratio, loss_conf * self.balance[l] * self.obj_ratio)
+        return loss
+    
+    def get_near_points(self, x, y, i, j):
+        sub_x = x - i
+        sub_y = y - j
+        if sub_x > 0.5 and sub_y > 0.5:
+            return [[0, 0], [1, 0], [0, 1]]
+        elif sub_x < 0.5 and sub_y > 0.5:
+            return [[0, 0], [-1, 0], [0, 1]]
+        elif sub_x < 0.5 and sub_y < 0.5:
+            return [[0, 0], [-1, 0], [0, -1]]
+        else:
+            return [[0, 0], [1, 0], [0, -1]]
+
+    def get_target(self, l, targets, anchors, in_h, in_w):
+        """Build the ground-truth tensor for feature level ``l``.
+
+        Args:
+            l: Index into ``self.anchors_mask`` selecting the anchors used on
+                this feature level.
+            targets: One tensor per image; columns 0-3 are the box and
+                column 4 the class index.  Columns 0/2 are multiplied by
+                ``in_w`` and 1/3 by ``in_h``, so the boxes are presumably
+                normalised (cx, cy, w, h) -- TODO confirm against the
+                dataloader.
+            anchors: Sequence of (w, h) anchor pairs, indexed by the entries
+                of ``self.anchors_mask[l]``.  Presumably already scaled to
+                grid units (the commented-out call in ``forward`` passes
+                ``scaled_anchors``) -- verify against the caller.
+            in_h, in_w: Height and width of the feature map.
+
+        Returns:
+            Tuple ``(y_true, noobj_mask)``:
+            ``y_true`` has shape (bs, len(anchors_mask[l]), in_h, in_w,
+            bbox_attrs) and ``noobj_mask`` has shape
+            (bs, len(anchors_mask[l]), in_h, in_w); the mask is 1 everywhere
+            except at cells that were assigned a ground-truth box.
+        """
+        #-----------------------------------------------------#
+        #   Number of images in the batch
+        #-----------------------------------------------------#
+        bs              = len(targets)
+        #-----------------------------------------------------#
+        #   Marks which anchors/cells contain no object
+        #   bs, 3, 20, 20
+        #-----------------------------------------------------#
+        noobj_mask      = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
+        #-----------------------------------------------------#
+        #   Remembers, per anchor/cell, the ratio of the ground-truth
+        #   box currently assigned to it (0 means "not assigned yet")
+        #-----------------------------------------------------#
+        box_best_ratio = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
+        #-----------------------------------------------------#
+        #   batch_size, 3, 20, 20, 5 + num_classes
+        #-----------------------------------------------------#
+        y_true          = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
+        for b in range(bs):            
+            # Images without any ground-truth box contribute nothing
+            if len(targets[b])==0:
+                continue
+            batch_target = torch.zeros_like(targets[b])
+            #-------------------------------------------------------#
+            #   Centre of each positive sample on the feature map
+            #   and size of each ground-truth box in grid units
+            #-------------------------------------------------------#
+            batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
+            batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
+            batch_target[:, 4] = targets[b][:, 4]
+            batch_target = batch_target.cpu()
+            
+            #-----------------------------------------------------------------------------#
+            #   batch_target                                    : num_true_box, 5
+            #   batch_target[:, 2:4]                            : num_true_box, 2
+            #   torch.unsqueeze(batch_target[:, 2:4], 1)        : num_true_box, 1, 2
+            #   anchors                                         : 9, 2
+            #   torch.unsqueeze(torch.FloatTensor(anchors), 0)  : 1, 9, 2
+            #   ratios_of_gt_anchors    : num_true_box, 9, 2
+            #   ratios_of_anchors_gt    : num_true_box, 9, 2
+            #
+            #   ratios                  : num_true_box, 9, 4
+            #   max_ratios              : num_true_box, 9   
+            #   max_ratios holds, for every (ground-truth box, anchor) pair,
+            #   the largest of the four width/height ratios.
+            #------------------------------------------------------------------------------#
+            ratios_of_gt_anchors = torch.unsqueeze(batch_target[:, 2:4], 1) / torch.unsqueeze(torch.FloatTensor(anchors), 0)
+            ratios_of_anchors_gt = torch.unsqueeze(torch.FloatTensor(anchors), 0) /  torch.unsqueeze(batch_target[:, 2:4], 1)
+            ratios               = torch.cat([ratios_of_gt_anchors, ratios_of_anchors_gt], dim = -1)
+            max_ratios, _        = torch.max(ratios, dim = -1)
+
+            for t, ratio in enumerate(max_ratios):
+                #-------------------------------------------------------#
+                #   ratio : 9
+                #-------------------------------------------------------#
+                # Despite its name, over_threshold is True for anchors whose
+                # largest ratio is BELOW self.threshold; the best-matching
+                # anchor is always enabled so every box gets at least one.
+                over_threshold = ratio < self.threshold
+                over_threshold[torch.argmin(ratio)] = True
+                for k, mask in enumerate(self.anchors_mask[l]):
+                    # Skip anchors of this level that do not match the box
+                    if not over_threshold[mask]:
+                        continue
+                    #----------------------------------------#
+                    #   Grid cell that contains the box centre
+                    #   x  1.25     => 1
+                    #   y  3.75     => 3
+                    #----------------------------------------#
+                    i = torch.floor(batch_target[t, 0]).long()
+                    j = torch.floor(batch_target[t, 1]).long()
+                    
+                    # The containing cell plus two neighbouring cells
+                    offsets = self.get_near_points(batch_target[t, 0], batch_target[t, 1], i, j)
+                    for offset in offsets:
+                        local_i = i + offset[0]
+                        local_j = j + offset[1]
+
+                        # Neighbours outside the feature map are dropped
+                        if local_i >= in_w or local_i < 0 or local_j >= in_h or local_j < 0:
+                            continue
+
+                        # The cell/anchor is already taken: replace the
+                        # stored box only if the new one has a smaller
+                        # (better) ratio, otherwise keep the old one.
+                        if box_best_ratio[b, k, local_j, local_i] != 0:
+                            if box_best_ratio[b, k, local_j, local_i] > ratio[mask]:
+                                y_true[b, k, local_j, local_i, :] = 0
+                            else:
+                                continue
+                            
+                        #----------------------------------------#
+                        #   Class index of the ground-truth box
+                        #----------------------------------------#
+                        c = batch_target[t, 4].long()
+
+                        #----------------------------------------#
+                        #   noobj_mask marks cells without an object;
+                        #   clear it for this assigned cell
+                        #----------------------------------------#
+                        noobj_mask[b, k, local_j, local_i] = 0
+                        #----------------------------------------#
+                        #   Store the ground-truth box in grid units
+                        #   (columns 0-3), the objectness flag and
+                        #   the one-hot class entry
+                        #----------------------------------------#
+                        y_true[b, k, local_j, local_i, 0] = batch_target[t, 0]
+                        y_true[b, k, local_j, local_i, 1] = batch_target[t, 1]
+                        y_true[b, k, local_j, local_i, 2] = batch_target[t, 2]
+                        y_true[b, k, local_j, local_i, 3] = batch_target[t, 3]
+                        y_true[b, k, local_j, local_i, 4] = 1
+                        y_true[b, k, local_j, local_i, c + 5] = 1
+                        #----------------------------------------#
+                        #   Remember the ratio now held by this anchor/cell
+                        #----------------------------------------#
+                        box_best_ratio[b, k, local_j, local_i] = ratio[mask]
+                        
+        return y_true, noobj_mask
+
+    def get_pred_boxes(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w):
+        """Decode network outputs into boxes expressed in grid units.
+
+        Note the parameter order: ``h`` comes before ``w``.
+
+        Args:
+            l: Index into ``self.anchors_mask`` selecting this level's anchors.
+            x, y: Centre adjustment values; ``forward`` passes sigmoid outputs
+                of shape (bs, num_anchors, in_h, in_w).
+            h, w: Height and width adjustment values with the same shape.
+            targets: Only ``len(targets)`` is used, as the batch size.
+            scaled_anchors: Sequence of (w, h) anchor pairs; indexed with
+                ``self.anchors_mask[l]``.
+            in_h, in_w: Height and width of the feature map.
+
+        Returns:
+            Tensor with one extra trailing dimension of size 4 holding
+            (centre x, centre y, width, height) for every anchor and cell.
+        """
+        #-----------------------------------------------------#
+        #   Number of images in the batch
+        #-----------------------------------------------------#
+        bs = len(targets)
+
+        #-----------------------------------------------------#
+        #   Build the grid: column index (grid_x) and row index
+        #   (grid_y) of every cell, i.e. its top-left corner
+        #-----------------------------------------------------#
+        grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
+            int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)
+        grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
+            int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
+
+        # Width and height of the anchors belonging to this level
+        scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
+        anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)
+        anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)
+        
+        # Broadcast the anchor sizes to the shape of the predictions
+        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
+        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
+        #-------------------------------------------------------#
+        #   Adjusted box centre and size:
+        #   centre = 2 * value - 0.5 + cell index
+        #   size   = (2 * value) ** 2 * anchor size
+        #-------------------------------------------------------#
+        pred_boxes_x    = torch.unsqueeze(x * 2. - 0.5 + grid_x, -1)
+        pred_boxes_y    = torch.unsqueeze(y * 2. - 0.5 + grid_y, -1)
+        pred_boxes_w    = torch.unsqueeze((w * 2) ** 2 * anchor_w, -1)
+        pred_boxes_h    = torch.unsqueeze((h * 2) ** 2 * anchor_h, -1)
+        pred_boxes      = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
+        return pred_boxes
+
+def is_parallel(model):
+    # Returns True if model is of type DP or DDP
+    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
+
+def de_parallel(model):
+    # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
+    return model.module if is_parallel(model) else model
+    
+def copy_attr(a, b, include=(), exclude=()):
+    # Copy attributes from b to a, options to only include [...] and to exclude [...]
+    for k, v in b.__dict__.items():
+        if (len(include) and k not in include) or k.startswith('_') or k in exclude:
+            continue
+        else:
+            setattr(a, k, v)
+
+class ModelEMA:
+    """ Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models
+    Keeps a moving average of everything in the model state_dict (parameters and buffers)
+    For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    """
+
+    def __init__(self, model, decay=0.9999, tau=2000, updates=0):
+        # Create EMA
+        self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
+        # if next(model.parameters()).device.type != 'cpu':
+        #     self.ema.half()  # FP16 EMA
+        self.updates = updates  # number of EMA updates
+        self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
+        for p in self.ema.parameters():
+            p.requires_grad_(False)
+
+    def update(self, model):
+        # Update EMA parameters
+        with torch.no_grad():
+            self.updates += 1
+            d = self.decay(self.updates)
+
+            msd = de_parallel(model).state_dict()  # model state_dict
+            for k, v in self.ema.state_dict().items():
+                if v.dtype.is_floating_point:
+                    v *= d
+                    v += (1 - d) * msd[k].detach()
+
+    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
+        # Update EMA attributes
+        copy_attr(self.ema, model, include, exclude)
+
+def weights_init(net, init_type='normal', init_gain = 0.02):
+    def init_func(m):
+        classname = m.__class__.__name__
+        if hasattr(m, 'weight') and classname.find('Conv') != -1:
+            if init_type == 'normal':
+                torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
+            elif init_type == 'xavier':
+                torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
+            elif init_type == 'kaiming':
+                torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+            elif init_type == 'orthogonal':
+                torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
+            else:
+                raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
+        elif classname.find('BatchNorm2d') != -1:
+            torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
+            torch.nn.init.constant_(m.bias.data, 0.0)
+    print('initialize network with %s type' % init_type)
+    net.apply(init_func)
+
+def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.05, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.05, step_num = 10):
+    def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
+        if iters <= warmup_total_iters:
+            # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
+            lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2
+            ) + warmup_lr_start
+        elif iters >= total_iters - no_aug_iter:
+            lr = min_lr
+        else:
+            lr = min_lr + 0.5 * (lr - min_lr) * (
+                1.0
+                + math.cos(
+                    math.pi
+                    * (iters - warmup_total_iters)
+                    / (total_iters - warmup_total_iters - no_aug_iter)
+                )
+            )
+        return lr
+
+    def step_lr(lr, decay_rate, step_size, iters):
+        if step_size < 1:
+            raise ValueError("step_size must above 1.")
+        n       = iters // step_size
+        out_lr  = lr * decay_rate ** n
+        return out_lr
+
+    if lr_decay_type == "cos":
+        warmup_total_iters  = min(max(warmup_iters_ratio * total_iters, 1), 3)
+        warmup_lr_start     = max(warmup_lr_ratio * lr, 1e-6)
+        no_aug_iter         = min(max(no_aug_iter_ratio * total_iters, 1), 15)
+        func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)
+    else:
+        decay_rate  = (min_lr / lr) ** (1 / (step_num - 1))
+        step_size   = total_iters / step_num
+        func = partial(step_lr, lr, decay_rate, step_size)
+
+    return func
+
+def set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
+    lr = lr_scheduler_func(epoch)
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+matplotlib==3.9.0
+numpy==1.24.3
+opencv_python==4.10.0.82
+Pillow==10.3.0
+pycocotools==2.0.7
+scipy==1.13.1
+torch==2.3.0
+torchvision==0.18.0
+tqdm==4.66.4
--- a/test.py
+++ b/test.py
--- a/train.py
+++ b/train.py
@@ -0,0 +1,560 @@
+#-------------------------------------#
+#       对数据集进行训练
+#-------------------------------------#
+import datetime
+import os
+from functools import partial
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+from network.yolo import YoloBody
+from network.yolo_training import (ModelEMA, YOLOLoss, get_lr_scheduler,
+                                set_optimizer_lr, weights_init)
+from utils.callbacks import EvalCallback, LossHistory
+from utils.dataloader import YoloDataset, yolo_dataset_collate
+from utils.utils import (download_weights, get_anchors, get_classes,
+                         seed_everything, show_config, worker_init_fn)
+from utils.utils_fit import fit_one_epoch
+
+import configparser
+
+if __name__ == "__main__":
+    conf=configparser.ConfigParser()
+    conf.read('config.ini',encoding='utf-8')
+    #---------------------------------#
+    #   Cuda    是否使用CudaTrue
+    #           没有GPU可以设置成False
+    #---------------------------------#
+    Cuda            = conf.getboolean('Train', 'Cuda')
+    #----------------------------------------------#
+    #   Seed    用于固定随机种子
+    #           使得每次独立训练都可以获得一样的结果
+    #----------------------------------------------#
+    seed            = conf.getint('Train', 'seed')
+    #---------------------------------------------------------------------#
+    #   distributed     用于指定是否使用单机多卡分布式运行
+    #                   终端指令仅支持Ubuntu。CUDA_VISIBLE_DEVICES用于在Ubuntu下指定显卡。
+    #                   Windows系统下默认使用DP模式调用所有显卡，不支持DDP。
+    #   DP模式：
+    #       设置            distributed = False
+    #       在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python train.py
+    #   DDP模式：
+    #       设置            distributed = True
+    #       在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
+    #---------------------------------------------------------------------#
+    distributed     = conf.getboolean('Train', 'distributed')
+    #---------------------------------------------------------------------#
+    #   sync_bn     是否使用sync_bn，DDP模式多卡可用
+    #---------------------------------------------------------------------#
+    sync_bn         = conf.getboolean('Train', 'sync_bn')
+    #---------------------------------------------------------------------#
+    #   fp16        是否使用混合精度训练
+    #               可减少约一半的显存、需要pytorch1.7.1以上
+    #---------------------------------------------------------------------#
+    fp16            = conf.getboolean('Train', 'fp16')
+    #---------------------------------------------------------------------#
+    #   classes_path    指向model_data下的txt，与自己训练的数据集相关 
+    #                   训练前一定要修改classes_path，使其对应自己的数据集
+    #---------------------------------------------------------------------#
+    classes_path    = conf.get('Train', 'classes_path')
+    #---------------------------------------------------------------------#
+    #   anchors_path    代表先验框对应的txt文件，一般不修改。
+    #   anchors_mask    用于帮助代码找到对应的先验框，一般不修改。
+    #---------------------------------------------------------------------#
+    anchors_path    = conf.get('Train', 'anchors_path')
+    anchors_mask    = eval(conf.get('Train', 'anchors_mask'))
+    #----------------------------------------------------------------------------------------------------------------------------#
+    #   权值文件的下载请看README，可以通过网盘下载。模型的 预训练权重 对不同数据集是通用的，因为特征是通用的。
+    #   模型的 预训练权重 比较重要的部分是 主干特征提取网络的权值部分，用于进行特征提取。
+    #   预训练权重对于99%的情况都必须要用，不用的话主干部分的权值太过随机，特征提取效果不明显，网络训练的结果也不会好
+    #
+    #   如果训练过程中存在中断训练的操作，可以将model_path设置成logs文件夹下的权值文件，将已经训练了一部分的权值再次载入。
+    #   同时修改下方的 冻结阶段 或者 解冻阶段 的参数，来保证模型epoch的连续性。
+    #   
+    #   当model_path = ''的时候不加载整个模型的权值。
+    #
+    #   此处使用的是整个模型的权重，因此是在train.py进行加载的。
+    #   如果想要让模型从0开始训练，则设置model_path = ''，下面的Freeze_Train = False，此时从0开始训练，且没有冻结主干的过程。
+    #   
+    #   一般来讲，网络从0开始的训练效果会很差，因为权值太过随机，特征提取效果不明显，因此非常、非常、非常不建议大家从0开始训练！
+    #   从0开始训练有两个方案：
+    #   1、得益于Mosaic数据增强方法强大的数据增强能力，将UnFreeze_Epoch设置的较大（300及以上）、batch较大（16及以上）、数据较多（万以上）的情况下，
+    #      可以设置mosaic=True，直接随机初始化参数开始训练，但得到的效果仍然不如有预训练的情况。（像COCO这样的大数据集可以这样做）
+    #   2、了解imagenet数据集，首先训练分类模型，获得网络的主干部分权值，分类模型的 主干部分 和该模型通用，基于此进行训练。
+    #----------------------------------------------------------------------------------------------------------------------------#
+    model_path      = conf.get('Train', 'model_path')
+    #------------------------------------------------------#
+    #   input_shape     输入的shape大小，一定要是32的倍数
+    #------------------------------------------------------#
+    input_shape     = eval(conf.get('Train', 'input_shape'))
+    #------------------------------------------------------#
+    #   backbone        cspdarknet（默认）
+    #                   convnext_tiny
+    #                   convnext_small
+    #                   swin_transfomer_tiny
+    #------------------------------------------------------#
+    backbone        = conf.get('Train', 'backbone')
+    #----------------------------------------------------------------------------------------------------------------------------#
+    #   pretrained      是否使用主干网络的预训练权重，此处使用的是主干的权重，因此是在模型构建的时候进行加载的。
+    #                   如果设置了model_path，则主干的权值无需加载，pretrained的值无意义。
+    #                   如果不设置model_path，pretrained = True，此时仅加载主干开始训练。
+    #                   如果不设置model_path，pretrained = False，Freeze_Train = False，此时从0开始训练，且没有冻结主干的过程。
+    #----------------------------------------------------------------------------------------------------------------------------#
+    pretrained      = conf.getboolean('Train', 'pretrained')
+    #------------------------------------------------------#
+    #   phi             所使用的YoloV5的版本。s、m、l、x
+    #                   在除cspdarknet的其它主干中仅影响panet的大小
+    #------------------------------------------------------#
+    phi             = conf.get('Train', 'phi')
+    # ------------------------------------------------------------------#
+    #   mosaic              马赛克数据增强。
+    #   mosaic_prob         每个step有多少概率使用mosaic数据增强，默认50%。
+    
+    #   mixup               是否使用mixup数据增强，仅在mosaic=True时有效。
+    #                       只会对mosaic增强后的图片进行mixup的处理。
+    #   mixup_prob          有多少概率在mosaic后使用mixup数据增强，默认50%。
+    #                       总的mixup概率为mosaic_prob * mixup_prob。
+    
+    #   special_aug_ratio   参考YoloX，由于Mosaic生成的训练图片，远远脱离自然图片的真实分布。
+    #                       当mosaic=True时，本代码会在special_aug_ratio范围内开启mosaic。
+    #                       默认为前70%个epoch，100个世代会开启70个世代。
+    # ------------------------------------------------------------------#
+    mosaic              = conf.getboolean('Train', 'mosaic')
+    mosaic_prob         = conf.getfloat('Train', 'mosaic_prob')
+    mixup               = conf.getboolean('Train', 'mixup')
+    mixup_prob          = conf.getfloat('Train', 'mixup_prob')
+    special_aug_ratio   = conf.getfloat('Train', 'special_aug_ratio')
+    #------------------------------------------------------------------#
+    #   label_smoothing     标签平滑。一般0.01以下。如0.01、0.005。
+    #------------------------------------------------------------------#
+    label_smoothing     = conf.getfloat('Train', 'label_smoothing')
+
+    #----------------------------------------------------------------------------------------------------------------------------#
+    #   训练分为两个阶段，分别是冻结阶段和解冻阶段。设置冻结阶段是为了满足机器性能不足的同学的训练需求。
+    #   冻结训练需要的显存较小，显卡非常差的情况下，可设置Freeze_Epoch等于UnFreeze_Epoch，Freeze_Train = True，此时仅仅进行冻结训练。
+    #      
+    #   在此提供若干参数设置建议，各位训练者根据自己的需求进行灵活调整：
+    #   （一）从整个模型的预训练权重开始训练： 
+    #       Adam：
+    #           Init_Epoch = 0，Freeze_Epoch = 50，UnFreeze_Epoch = 100，Freeze_Train = True，optimizer_type = 'adam'，Init_lr = 1e-3，weight_decay = 0。（冻结）
+    #           Init_Epoch = 0，UnFreeze_Epoch = 100，Freeze_Train = False，optimizer_type = 'adam'，Init_lr = 1e-3，weight_decay = 0。（不冻结）
+    #       SGD：
+    #           Init_Epoch = 0，Freeze_Epoch = 50，UnFreeze_Epoch = 300，Freeze_Train = True，optimizer_type = 'sgd'，Init_lr = 1e-2，weight_decay = 5e-4。（冻结）
+    #           Init_Epoch = 0，UnFreeze_Epoch = 300，Freeze_Train = False，optimizer_type = 'sgd'，Init_lr = 1e-2，weight_decay = 5e-4。（不冻结）
+    #       其中：UnFreeze_Epoch可以在100-300之间调整。
+    #   （二）从0开始训练：
+    #       Init_Epoch = 0，UnFreeze_Epoch >= 300，Unfreeze_batch_size >= 16，Freeze_Train = False（不冻结训练）
+    #       其中：UnFreeze_Epoch尽量不小于300。optimizer_type = 'sgd'，Init_lr = 1e-2，mosaic = True。
+    #   （三）batch_size的设置：
+    #       在显卡能够接受的范围内，以大为好。显存不足与数据集大小无关，提示显存不足（OOM或者CUDA out of memory）请调小batch_size。
+    #       受到BatchNorm层影响，batch_size最小为2，不能为1。
+    #       正常情况下Freeze_batch_size建议为Unfreeze_batch_size的1-2倍。不建议设置的差距过大，因为关系到学习率的自动调整。
+    #----------------------------------------------------------------------------------------------------------------------------#
+    #------------------------------------------------------------------#
+    #   冻结阶段训练参数
+    #   此时模型的主干被冻结了，特征提取网络不发生改变
+    #   占用的显存较小，仅对网络进行微调
+    #   Init_Epoch          模型当前开始的训练世代，其值可以大于Freeze_Epoch，如设置：
+    #                       Init_Epoch = 60、Freeze_Epoch = 50、UnFreeze_Epoch = 100
+    #                       会跳过冻结阶段，直接从60代开始，并调整对应的学习率。
+    #                       （断点续练时使用）
+    #   Freeze_Epoch        模型冻结训练的Freeze_Epoch
+    #                       (当Freeze_Train=False时失效)
+    #   Freeze_batch_size   模型冻结训练的batch_size
+    #                       (当Freeze_Train=False时失效)
+    #------------------------------------------------------------------#
+    Init_Epoch          = 0
+    Freeze_Epoch        = 50
+    Freeze_batch_size   = 10
+    #------------------------------------------------------------------#
+    #   解冻阶段训练参数
+    #   此时模型的主干不被冻结了，特征提取网络会发生改变
+    #   占用的显存较大，网络所有的参数都会发生改变
+    #   UnFreeze_Epoch          模型总共训练的epoch
+    #                           SGD需要更长的时间收敛，因此设置较大的UnFreeze_Epoch
+    #                           Adam可以使用相对较小的UnFreeze_Epoch
+    #   Unfreeze_batch_size     模型在解冻后的batch_size
+    #------------------------------------------------------------------#
+    UnFreeze_Epoch      = 500
+    Unfreeze_batch_size = 6
+    #------------------------------------------------------------------#
+    #   Freeze_Train    是否进行冻结训练
+    #                   默认先冻结主干训练后解冻训练。
+    #------------------------------------------------------------------#
+    Freeze_Train        = True
+
+    #------------------------------------------------------------------#
+    #   其它训练参数：学习率、优化器、学习率下降有关
+    #------------------------------------------------------------------#
+    #------------------------------------------------------------------#
+    #   Init_lr         模型的最大学习率
+    #   Min_lr          模型的最小学习率，默认为最大学习率的0.01
+    #------------------------------------------------------------------#
+    Init_lr             = 1e-2
+    Min_lr              = Init_lr * 0.01
+    #------------------------------------------------------------------#
+    #   optimizer_type  使用到的优化器种类，可选的有adam、sgd
+    #                   当使用Adam优化器时建议设置  Init_lr=1e-3
+    #                   当使用SGD优化器时建议设置   Init_lr=1e-2
+    #   momentum        优化器内部使用到的momentum参数
+    #   weight_decay    权值衰减，可防止过拟合
+    #                   adam会导致weight_decay错误，使用adam时建议设置为0。
+    #------------------------------------------------------------------#
+    optimizer_type      = "sgd"
+    momentum            = 0.937
+    weight_decay        = 5e-4
+    #------------------------------------------------------------------#
+    #   lr_decay_type   使用到的学习率下降方式，可选的有step、cos
+    #------------------------------------------------------------------#
+    lr_decay_type       = "cos"
+    #------------------------------------------------------------------#
+    #   save_period     多少个epoch保存一次权值
+    #------------------------------------------------------------------#
+    save_period         = 10
+    #------------------------------------------------------------------#
+    #   save_dir        权值与日志文件保存的文件夹
+    #------------------------------------------------------------------#
+    save_dir            = 'logs'
+    #------------------------------------------------------------------#
+    #   eval_flag       是否在训练时进行评估，评估对象为验证集
+    #                   安装pycocotools库后，评估体验更佳。
+    #   eval_period     代表多少个epoch评估一次，不建议频繁的评估
+    #                   评估需要消耗较多的时间，频繁评估会导致训练非常慢
+    #   此处获得的mAP会与get_map.py获得的会有所不同，原因有二：
+    #   （一）此处获得的mAP为验证集的mAP。
+    #   （二）此处设置评估参数较为保守，目的是加快评估速度。
+    #------------------------------------------------------------------#
+    eval_flag           = True
+    eval_period         = 10
+    #------------------------------------------------------------------#
+    #   num_workers     用于设置是否使用多线程读取数据
+    #                   开启后会加快数据读取速度，但是会占用更多内存
+    #                   内存较小的电脑可以设置为2或者0  
+    #------------------------------------------------------------------#
+    num_workers         = 20
+
+    #------------------------------------------------------#
+    #   train_annotation_path   训练图片路径和标签
+    #   val_annotation_path     验证图片路径和标签
+    #------------------------------------------------------#
+    train_annotation_path   = '2007_train.txt'
+    val_annotation_path     = '2007_val.txt'
+
+    seed_everything(seed)
+    #------------------------------------------------------#
+    #   设置用到的显卡
+    #------------------------------------------------------#
+    ngpus_per_node  = torch.cuda.device_count()
+    if distributed:
+        dist.init_process_group(backend="nccl")
+        local_rank  = int(os.environ["LOCAL_RANK"])
+        rank        = int(os.environ["RANK"])
+        device      = torch.device("cuda", local_rank)
+        if local_rank == 0:
+            print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
+            print("Gpu Device Count : ", ngpus_per_node)
+    else:
+        device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        local_rank      = 0
+        rank            = 0
+
+    #------------------------------------------------------#
+    #   获取classes和anchor
+    #------------------------------------------------------#
+    class_names, num_classes = get_classes(classes_path)
+    anchors, num_anchors     = get_anchors(anchors_path)
+
+    #----------------------------------------------------#
+    #   下载预训练权重
+    #----------------------------------------------------#
+    if pretrained:
+        if distributed:
+            if local_rank == 0:
+                download_weights(backbone, phi)  
+            dist.barrier()
+        else:
+            download_weights(backbone, phi)
+
+    #------------------------------------------------------#
+    #   创建yolo模型
+    #------------------------------------------------------#
+    model = YoloBody(anchors_mask, num_classes, phi, backbone, pretrained=pretrained, input_shape=input_shape)
+    if not pretrained:
+        weights_init(model)
+    if model_path != '':
+        if local_rank == 0:
+            print('Load weights {}.'.format(model_path))
+        #------------------------------------------------------#
+        #   根据预训练权重的Key和模型的Key进行加载
+        #------------------------------------------------------#
+        model_dict      = model.state_dict()
+        pretrained_dict = torch.load(model_path, map_location = device)
+        load_key, no_load_key, temp_dict = [], [], {}
+        for k, v in pretrained_dict.items():
+            if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v):
+                temp_dict[k] = v
+                load_key.append(k)
+            else:
+                no_load_key.append(k)
+        model_dict.update(temp_dict)
+        model.load_state_dict(model_dict)
+        #------------------------------------------------------#
+        #   显示没有匹配上的Key
+        #------------------------------------------------------#
+        if local_rank == 0:
+            print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
+            print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
+            print("\n\033[1;33;44m温馨提示，head部分没有载入是正常现象，Backbone部分没有载入是错误的。\033[0m")
+
+    #----------------------#
+    #   获得损失函数
+    #----------------------#
+    yolo_loss    = YOLOLoss(anchors, num_classes, input_shape, Cuda, anchors_mask, label_smoothing)
+    #----------------------#
+    #   记录Loss
+    #----------------------#
+    if local_rank == 0:
+        time_str        = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S')
+        log_dir         = os.path.join(save_dir, "loss_" + str(time_str))
+        loss_history    = LossHistory(log_dir, model, input_shape=input_shape)
+    else:
+        loss_history    = None
+        
+    #------------------------------------------------------------------#
+    #   torch 1.2不支持amp，建议使用torch 1.7.1及以上正确使用fp16
+    #   因此torch1.2这里显示"could not be resolve"
+    #------------------------------------------------------------------#
+    if fp16:
+        from torch.cuda.amp import GradScaler as GradScaler
+        scaler = GradScaler()
+    else:
+        scaler = None
+
+    model_train     = model.train()
+    #----------------------------#
+    #   多卡同步Bn
+    #----------------------------#
+    if sync_bn and ngpus_per_node > 1 and distributed:
+        model_train = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_train)
+    elif sync_bn:
+        print("Sync_bn is not support in one gpu or not distributed.")
+
+    if Cuda:
+        if distributed:
+            #----------------------------#
+            #   多卡平行运行
+            #----------------------------#
+            model_train = model_train.cuda(local_rank)
+            model_train = torch.nn.parallel.DistributedDataParallel(model_train, device_ids=[local_rank], find_unused_parameters=True)
+        else:
+            model_train = torch.nn.DataParallel(model)
+            cudnn.benchmark = True
+            model_train = model_train.cuda()
+            
+    #----------------------------#
+    #   权值平滑
+    #----------------------------#
+    ema = ModelEMA(model_train)
+    
+    #---------------------------#
+    #   读取数据集对应的txt
+    #---------------------------#
+    with open(train_annotation_path, encoding='utf-8') as f:
+        train_lines = f.readlines()
+    with open(val_annotation_path, encoding='utf-8') as f:
+        val_lines   = f.readlines()
+    num_train   = len(train_lines)
+    num_val     = len(val_lines)
+
+    if local_rank == 0:
+        show_config(
+            classes_path = classes_path, anchors_path = anchors_path, anchors_mask = anchors_mask, model_path = model_path, input_shape = input_shape, \
+            Init_Epoch = Init_Epoch, Freeze_Epoch = Freeze_Epoch, UnFreeze_Epoch = UnFreeze_Epoch, Freeze_batch_size = Freeze_batch_size, Unfreeze_batch_size = Unfreeze_batch_size, Freeze_Train = Freeze_Train, \
+            Init_lr = Init_lr, Min_lr = Min_lr, optimizer_type = optimizer_type, momentum = momentum, lr_decay_type = lr_decay_type, \
+            save_period = save_period, save_dir = save_dir, num_workers = num_workers, num_train = num_train, num_val = num_val
+        )
+        #---------------------------------------------------------#
+        #   总训练世代指的是遍历全部数据的总次数
+        #   总训练步长指的是梯度下降的总次数 
+        #   每个训练世代包含若干训练步长，每个训练步长进行一次梯度下降。
+        #   此处仅建议最低训练世代，上不封顶，计算时只考虑了解冻部分
+        #----------------------------------------------------------#
+        wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4
+        total_step  = num_train // Unfreeze_batch_size * UnFreeze_Epoch
+        if total_step <= wanted_step:
+            if num_train // Unfreeze_batch_size == 0:
+                raise ValueError('数据集过小，无法进行训练，请扩充数据集。')
+            wanted_epoch = wanted_step // (num_train // Unfreeze_batch_size) + 1
+            print("\n\033[1;33;44m[Warning] 使用%s优化器时，建议将训练总步长设置到%d以上。\033[0m"%(optimizer_type, wanted_step))
+            print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%d，Unfreeze_batch_size为%d，共训练%d个Epoch，计算出总训练步长为%d。\033[0m"%(num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step))
+            print("\033[1;33;44m[Warning] 由于总训练步长为%d，小于建议总步长%d，建议设置总世代为%d。\033[0m"%(total_step, wanted_step, wanted_epoch))
+
+    #------------------------------------------------------#
+    #   主干特征提取网络特征通用，冻结训练可以加快训练速度
+    #   也可以在训练初期防止权值被破坏。
+    #   Init_Epoch为起始世代
+    #   Freeze_Epoch为冻结训练的世代
+    #   UnFreeze_Epoch总训练世代
+    #   提示OOM或者显存不足请调小Batch_size
+    #------------------------------------------------------#
+    if True:
+        UnFreeze_flag = False
+        #------------------------------------#
+        #   冻结一定部分训练
+        #------------------------------------#
+        if Freeze_Train:
+            for param in model.backbone.parameters():
+                param.requires_grad = False
+
+        #-------------------------------------------------------------------#
+        #   如果不冻结训练的话，直接设置batch_size为Unfreeze_batch_size
+        #-------------------------------------------------------------------#
+        batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size
+
+        #-------------------------------------------------------------------#
+        #   判断当前batch_size，自适应调整学习率
+        #-------------------------------------------------------------------#
+        nbs             = 64
+        lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
+        lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
+        Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
+        Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
+
+        #---------------------------------------#
+        #   根据optimizer_type选择优化器
+        #---------------------------------------#
+        pg0, pg1, pg2 = [], [], []  
+        for k, v in model.named_modules():
+            if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
+                pg2.append(v.bias)    
+            if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+                pg0.append(v.weight)    
+            elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
+                pg1.append(v.weight)   
+        optimizer = {
+            'adam'  : optim.Adam(pg0, Init_lr_fit, betas = (momentum, 0.999)),
+            'sgd'   : optim.SGD(pg0, Init_lr_fit, momentum = momentum, nesterov=True)
+        }[optimizer_type]
+        optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay})
+        optimizer.add_param_group({"params": pg2})
+
+        #---------------------------------------#
+        #   获得学习率下降的公式
+        #---------------------------------------#
+        lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
+        
+        #---------------------------------------#
+        #   判断每一个世代的长度
+        #---------------------------------------#
+        epoch_step      = num_train // batch_size
+        epoch_step_val  = num_val // batch_size
+        
+        if epoch_step == 0 or epoch_step_val == 0:
+            raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
+
+        if ema:
+            ema.updates     = epoch_step * Init_Epoch
+        
+        #---------------------------------------#
+        #   构建数据集加载器。
+        #---------------------------------------#
+        train_dataset   = YoloDataset(train_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
+                                        mosaic=mosaic, mixup=mixup, mosaic_prob=mosaic_prob, mixup_prob=mixup_prob, train=True, special_aug_ratio=special_aug_ratio)
+        val_dataset     = YoloDataset(val_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
+                                        mosaic=False, mixup=False, mosaic_prob=0, mixup_prob=0, train=False, special_aug_ratio=0)
+        
+        if distributed:
+            train_sampler   = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,)
+            val_sampler     = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False,)
+            batch_size      = batch_size // ngpus_per_node
+            shuffle         = False
+        else:
+            train_sampler   = None
+            val_sampler     = None
+            shuffle         = True
+
+        gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
+                                    drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
+                                    worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
+        gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
+                                    drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
+                                    worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
+
+        #----------------------#
+        #   记录eval的map曲线
+        #----------------------#
+        if local_rank == 0:
+            eval_callback   = EvalCallback(model, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, Cuda, \
+                                            eval_flag=eval_flag, period=eval_period)
+        else:
+            eval_callback   = None
+        
+        #---------------------------------------#
+        #   开始模型训练
+        #---------------------------------------#
+        for epoch in range(Init_Epoch, UnFreeze_Epoch):
+            #---------------------------------------#
+            #   如果模型有冻结学习部分
+            #   则解冻，并设置参数
+            #---------------------------------------#
+            if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
+                batch_size = Unfreeze_batch_size
+
+                #-------------------------------------------------------------------#
+                #   判断当前batch_size，自适应调整学习率
+                #-------------------------------------------------------------------#
+                nbs             = 64
+                lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
+                lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
+                Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
+                Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
+                #---------------------------------------#
+                #   获得学习率下降的公式
+                #---------------------------------------#
+                lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
+
+                for param in model.backbone.parameters():
+                    param.requires_grad = True
+
+                epoch_step      = num_train // batch_size
+                epoch_step_val  = num_val // batch_size
+
+                if epoch_step == 0 or epoch_step_val == 0:
+                    raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
+                    
+                if ema:
+                    ema.updates     = epoch_step * epoch
+
+                if distributed:
+                    batch_size  = batch_size // ngpus_per_node
+                    
+                gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
+                                            drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
+                                            worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
+                gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
+                                            drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
+                                            worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
+
+                UnFreeze_flag   = True
+
+            gen.dataset.epoch_now       = epoch
+            gen_val.dataset.epoch_now   = epoch
+
+            if distributed:
+                train_sampler.set_epoch(epoch)
+
+            set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
+
+            fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank)
+            
+            if distributed:
+                dist.barrier()
+
+        if local_rank == 0:
+            loss_history.writer.close()
--- a/utils/pycache/callbacks.cpython-310.pyc
+++ b/utils/pycache/callbacks.cpython-310.pyc
--- a/utils/pycache/dataloader.cpython-310.pyc
+++ b/utils/pycache/dataloader.cpython-310.pyc
--- a/utils/pycache/utils.cpython-310.pyc
+++ b/utils/pycache/utils.cpython-310.pyc
--- a/utils/pycache/utils_bbox.cpython-310.pyc
+++ b/utils/pycache/utils_bbox.cpython-310.pyc
--- a/utils/pycache/utils_fit.cpython-310.pyc
+++ b/utils/pycache/utils_fit.cpython-310.pyc
--- a/utils/pycache/utils_map.cpython-310.pyc
+++ b/utils/pycache/utils_map.cpython-310.pyc
--- a/utils/callbacks.py
+++ b/utils/callbacks.py
@@ -0,0 +1,232 @@
+import datetime
+import os
+
+import torch
+import matplotlib
+matplotlib.use('Agg')
+import scipy.signal
+from matplotlib import pyplot as plt
+from torch.utils.tensorboard import SummaryWriter
+
+import shutil
+import numpy as np
+
+from PIL import Image
+from tqdm import tqdm
+from .utils import cvtColor, preprocess_input, resize_image
+from .utils_bbox import DecodeBox
+from .utils_map import get_coco_map, get_map
+
+
+class LossHistory():
+    def __init__(self, log_dir, model, input_shape):
+        self.log_dir    = log_dir
+        self.losses     = []
+        self.val_loss   = []
+        
+        os.makedirs(self.log_dir)
+        self.writer     = SummaryWriter(self.log_dir)
+        try:
+            dummy_input     = torch.randn(2, 3, input_shape[0], input_shape[1])
+            self.writer.add_graph(model, dummy_input)
+        except:
+            pass
+
+    def append_loss(self, epoch, loss, val_loss):
+        if not os.path.exists(self.log_dir):
+            os.makedirs(self.log_dir)
+
+        self.losses.append(loss)
+        self.val_loss.append(val_loss)
+
+        with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
+            f.write(str(loss))
+            f.write("\n")
+        with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
+            f.write(str(val_loss))
+            f.write("\n")
+
+        self.writer.add_scalar('loss', loss, epoch)
+        self.writer.add_scalar('val_loss', val_loss, epoch)
+        self.loss_plot()
+
+    def loss_plot(self):
+        iters = range(len(self.losses))
+
+        plt.figure()
+        plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss')
+        plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss')
+        try:
+            if len(self.losses) < 25:
+                num = 5
+            else:
+                num = 15
+            
+            plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss')
+            plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss')
+        except:
+            pass
+
+        plt.grid(True)
+        plt.xlabel('Epoch')
+        plt.ylabel('Loss')
+        plt.legend(loc="upper right")
+
+        plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))
+
+        plt.cla()
+        plt.close("all")
+
+class EvalCallback():
+    def __init__(self, net, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, cuda, \
+            map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5, letterbox_image=True, MINOVERLAP=0.5, eval_flag=True, period=1):
+        super(EvalCallback, self).__init__()
+        
+        self.net                = net
+        self.input_shape        = input_shape
+        self.anchors            = anchors
+        self.anchors_mask       = anchors_mask
+        self.class_names        = class_names
+        self.num_classes        = num_classes
+        self.val_lines          = val_lines
+        self.log_dir            = log_dir
+        self.cuda               = cuda
+        self.map_out_path       = map_out_path
+        self.max_boxes          = max_boxes
+        self.confidence         = confidence
+        self.nms_iou            = nms_iou
+        self.letterbox_image    = letterbox_image
+        self.MINOVERLAP         = MINOVERLAP
+        self.eval_flag          = eval_flag
+        self.period             = period
+        
+        self.bbox_util          = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]), self.anchors_mask)
+        
+        self.maps       = [0]
+        self.epoches    = [0]
+        if self.eval_flag:
+            with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
+                f.write(str(0))
+                f.write("\n")
+
+    def get_map_txt(self, image_id, image, class_names, map_out_path):
+        f = open(os.path.join(map_out_path, "detection-results/"+image_id+".txt"), "w", encoding='utf-8') 
+        image_shape = np.array(np.shape(image)[0:2])
+        #---------------------------------------------------------#
+        #   在这里将图像转换成RGB图像，防止灰度图在预测时报错。
+        #   代码仅仅支持RGB图像的预测，所有其它类型的图像都会转化成RGB
+        #---------------------------------------------------------#
+        image       = cvtColor(image)
+        #---------------------------------------------------------#
+        #   给图像增加灰条，实现不失真的resize
+        #   也可以直接resize进行识别
+        #---------------------------------------------------------#
+        image_data  = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
+        #---------------------------------------------------------#
+        #   添加上batch_size维度
+        #---------------------------------------------------------#
+        image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
+
+        with torch.no_grad():
+            images = torch.from_numpy(image_data)
+            if self.cuda:
+                images = images.cuda()
+            #---------------------------------------------------------#
+            #   将图像输入网络当中进行预测！
+            #---------------------------------------------------------#
+            outputs = self.net(images)
+            outputs = self.bbox_util.decode_box(outputs)
+            #---------------------------------------------------------#
+            #   将预测框进行堆叠，然后进行非极大抑制
+            #---------------------------------------------------------#
+            results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape, 
+                        image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
+                                                    
+            if results[0] is None: 
+                return 
+
+            top_label   = np.array(results[0][:, 6], dtype = 'int32')
+            top_conf    = results[0][:, 4] * results[0][:, 5]
+            top_boxes   = results[0][:, :4]
+
+        top_100     = np.argsort(top_conf)[::-1][:self.max_boxes]
+        top_boxes   = top_boxes[top_100]
+        top_conf    = top_conf[top_100]
+        top_label   = top_label[top_100]
+
+        for i, c in list(enumerate(top_label)):
+            predicted_class = self.class_names[int(c)]
+            box             = top_boxes[i]
+            score           = str(top_conf[i])
+
+            top, left, bottom, right = box
+            if predicted_class not in class_names:
+                continue
+
+            f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)),str(int(bottom))))
+
+        f.close()
+        return 
+    
+    def on_epoch_end(self, epoch, model_eval):
+        if epoch % self.period == 0 and self.eval_flag:
+            self.net = model_eval
+            if not os.path.exists(self.map_out_path):
+                os.makedirs(self.map_out_path)
+            if not os.path.exists(os.path.join(self.map_out_path, "ground-truth")):
+                os.makedirs(os.path.join(self.map_out_path, "ground-truth"))
+            if not os.path.exists(os.path.join(self.map_out_path, "detection-results")):
+                os.makedirs(os.path.join(self.map_out_path, "detection-results"))
+            print("Get map.")
+            for annotation_line in tqdm(self.val_lines):
+                line        = annotation_line.split()
+                image_id    = os.path.basename(line[0]).split('.')[0]
+                #------------------------------#
+                #   读取图像并转换成RGB图像
+                #------------------------------#
+                image       = Image.open(line[0])
+                #------------------------------#
+                #   获得预测框
+                #------------------------------#
+                gt_boxes    = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
+                #------------------------------#
+                #   获得预测txt
+                #------------------------------#
+                self.get_map_txt(image_id, image, self.class_names, self.map_out_path)
+                
+                #------------------------------#
+                #   获得真实框txt
+                #------------------------------#
+                with open(os.path.join(self.map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f:
+                    for box in gt_boxes:
+                        left, top, right, bottom, obj = box
+                        obj_name = self.class_names[obj]
+                        new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
+                        
+            print("Calculate Map.")
+            try:
+                temp_map = get_coco_map(class_names = self.class_names, path = self.map_out_path)[1]
+            except:
+                temp_map = get_map(self.MINOVERLAP, False, path = self.map_out_path)
+            self.maps.append(temp_map)
+            self.epoches.append(epoch)
+
+            with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
+                f.write(str(temp_map))
+                f.write("\n")
+            
+            plt.figure()
+            plt.plot(self.epoches, self.maps, 'red', linewidth = 2, label='train map')
+
+            plt.grid(True)
+            plt.xlabel('Epoch')
+            plt.ylabel('Map %s'%str(self.MINOVERLAP))
+            plt.title('A Map Curve')
+            plt.legend(loc="upper right")
+
+            plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
+            plt.cla()
+            plt.close("all")
+
+            print("Get map done.")
+            shutil.rmtree(self.map_out_path)
--- a/utils/dataloader.py
+++ b/utils/dataloader.py
@@ -0,0 +1,504 @@
+from random import sample, shuffle
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data.dataset import Dataset
+
+from utils.utils import cvtColor, preprocess_input
+
+
+class YoloDataset(Dataset):
+    def __init__(self, annotation_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length, \
+                        mosaic, mixup, mosaic_prob, mixup_prob, train, special_aug_ratio = 0.7):
+        super(YoloDataset, self).__init__()
+        self.annotation_lines   = annotation_lines
+        self.input_shape        = input_shape
+        self.num_classes        = num_classes
+        self.anchors            = anchors
+        self.anchors_mask       = anchors_mask
+        self.epoch_length       = epoch_length
+        self.mosaic             = mosaic
+        self.mosaic_prob        = mosaic_prob
+        self.mixup              = mixup
+        self.mixup_prob         = mixup_prob
+        self.train              = train
+        self.special_aug_ratio  = special_aug_ratio
+
+        self.epoch_now          = -1
+        self.length             = len(self.annotation_lines)
+        
+        self.bbox_attrs         = 5 + num_classes
+        self.threshold          = 4
+
+    def __len__(self):
+        """Return the number of annotation lines counted at construction time."""
+        return self.length
+
+    def __getitem__(self, index):
+        index       = index % self.length
+
+        #---------------------------------------------------#
+        #   训练时进行数据的随机增强
+        #   验证时不进行数据的随机增强
+        #---------------------------------------------------#
+        if self.mosaic and self.rand() < self.mosaic_prob and self.epoch_now < self.epoch_length * self.special_aug_ratio:
+            lines = sample(self.annotation_lines, 3)
+            lines.append(self.annotation_lines[index])
+            shuffle(lines)
+            image, box  = self.get_random_data_with_Mosaic(lines, self.input_shape)
+            
+            if self.mixup and self.rand() < self.mixup_prob:
+                lines           = sample(self.annotation_lines, 1)
+                image_2, box_2  = self.get_random_data(lines[0], self.input_shape, random = self.train)
+                image, box      = self.get_random_data_with_MixUp(image, box, image_2, box_2)
+        else:
+            image, box      = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train)
+
+        image       = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
+        box         = np.array(box, dtype=np.float32)
+        if len(box) != 0:
+            #---------------------------------------------------#
+            #   对真实框进行归一化，调整到0-1之间
+            #---------------------------------------------------#
+            box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
+            box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
+            #---------------------------------------------------#
+            #   序号为0、1的部分，为真实框的中心
+            #   序号为2、3的部分，为真实框的宽高
+            #   序号为4的部分，为真实框的种类
+            #---------------------------------------------------#
+            box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
+            box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
+        y_true = self.get_target(box)
+        return image, box, y_true
+
+    def rand(self, a=0, b=1):
+        return np.random.rand()*(b-a) + a
+
+    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
+        """Load one annotated image and fit it (and its boxes) onto the input canvas.
+
+        ``annotation_line`` is split on whitespace: the first token is the image
+        path, every further token is a comma-separated list of integers
+        describing one box (columns 0/2 are treated as x, columns 1/3 as y).
+
+        With ``random=False`` the image is letterboxed: resized with its aspect
+        ratio kept and centred on a gray ``(w, h)`` canvas. With ``random=True``
+        the image gets a random aspect-ratio/scale change, a random placement
+        on the canvas, a random horizontal flip and a random HSV jitter.
+
+        Args:
+            annotation_line: One annotation string as described above.
+            input_shape: ``(h, w)`` of the output canvas.
+            jitter: Strength of the random aspect-ratio distortion.
+            hue, sat, val: Strength of the random gain per HSV channel.
+            random: Enables the random augmentation branch.
+
+        Returns:
+            ``(image_data, box)``: ``image_data`` is a float32 array in the
+            non-random branch and a uint8 array in the random branch; ``box``
+            holds the boxes mapped onto the canvas, clipped to it, with boxes
+            of width or height <= 1 removed.
+        """
+        line    = annotation_line.split()
+        #------------------------------#
+        #   Read the image and convert it to RGB
+        #------------------------------#
+        image   = Image.open(line[0])
+        image   = cvtColor(image)
+        #------------------------------#
+        #   Source image size and target canvas size
+        #------------------------------#
+        iw, ih  = image.size
+        h, w    = input_shape
+        #------------------------------#
+        #   Parse the boxes (integer array, so later
+        #   assignments into it are truncated to int)
+        #------------------------------#
+        box     = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
+
+        if not random:
+            # Largest scale at which the whole image still fits on the canvas.
+            scale = min(w/iw, h/ih)
+            nw = int(iw*scale)
+            nh = int(ih*scale)
+            dx = (w-nw)//2
+            dy = (h-nh)//2
+
+            #---------------------------------#
+            #   Pad the unused canvas area with gray
+            #---------------------------------#
+            image       = image.resize((nw,nh), Image.BICUBIC)
+            new_image   = Image.new('RGB', (w,h), (128,128,128))
+            new_image.paste(image, (dx, dy))
+            image_data  = np.array(new_image, np.float32)
+
+            #---------------------------------#
+            #   Map the boxes onto the canvas
+            #---------------------------------#
+            if len(box)>0:
+                np.random.shuffle(box)
+                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+                box[:, 0:2][box[:, 0:2]<0] = 0
+                box[:, 2][box[:, 2]>w] = w
+                box[:, 3][box[:, 3]>h] = h
+                box_w = box[:, 2] - box[:, 0]
+                box_h = box[:, 3] - box[:, 1]
+                box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
+
+            return image_data, box
+                
+        #------------------------------------------#
+        #   Randomly rescale the image and distort
+        #   its aspect ratio
+        #------------------------------------------#
+        new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
+        scale = self.rand(.25, 2)
+        if new_ar < 1:
+            nh = int(scale*h)
+            nw = int(nh*new_ar)
+        else:
+            nw = int(scale*w)
+            nh = int(nw/new_ar)
+        image = image.resize((nw,nh), Image.BICUBIC)
+
+        #------------------------------------------#
+        #   Paste at a random offset on a gray canvas;
+        #   the offset is negative when the resized
+        #   image is larger than the canvas
+        #------------------------------------------#
+        dx = int(self.rand(0, w-nw))
+        dy = int(self.rand(0, h-nh))
+        new_image = Image.new('RGB', (w,h), (128,128,128))
+        new_image.paste(image, (dx, dy))
+        image = new_image
+
+        #------------------------------------------#
+        #   Random horizontal flip
+        #------------------------------------------#
+        flip = self.rand()<.5
+        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
+
+        image_data      = np.array(image, np.uint8)
+        #---------------------------------#
+        #   Colour jitter: draw one random
+        #   gain per HSV channel
+        #---------------------------------#
+        r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
+        #---------------------------------#
+        #   Convert the image to HSV.
+        #   NOTE: this rebinds the hue/sat/val
+        #   parameters to the image channels; the
+        #   gains were already folded into r above.
+        #---------------------------------#
+        hue, sat, val   = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
+        dtype           = image_data.dtype
+        #---------------------------------#
+        #   Apply the gains through lookup tables
+        #---------------------------------#
+        x       = np.arange(0, 256, dtype=r.dtype)
+        lut_hue = ((x * r[0]) % 180).astype(dtype)
+        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
+        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
+
+        #---------------------------------#
+        #   Map the boxes onto the canvas
+        #---------------------------------#
+        if len(box)>0:
+            np.random.shuffle(box)
+            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+            # Mirror the x coordinates, swapping left and right edges.
+            if flip: box[:, [0,2]] = w - box[:, [2,0]]
+            box[:, 0:2][box[:, 0:2]<0] = 0
+            box[:, 2][box[:, 2]>w] = w
+            box[:, 3][box[:, 3]>h] = h
+            box_w = box[:, 2] - box[:, 0]
+            box_h = box[:, 3] - box[:, 1]
+            # Drop boxes that collapsed to (almost) nothing after clipping.
+            box = box[np.logical_and(box_w>1, box_h>1)] 
+        
+        return image_data, box
+    
+    def merge_bboxes(self, bboxes, cutx, cuty):
+        merge_bbox = []
+        for i in range(len(bboxes)):
+            for box in bboxes[i]:
+                tmp_box = []
+                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+
+                if i == 0:
+                    if y1 > cuty or x1 > cutx:
+                        continue
+                    if y2 >= cuty and y1 <= cuty:
+                        y2 = cuty
+                    if x2 >= cutx and x1 <= cutx:
+                        x2 = cutx
+
+                if i == 1:
+                    if y2 < cuty or x1 > cutx:
+                        continue
+                    if y2 >= cuty and y1 <= cuty:
+                        y1 = cuty
+                    if x2 >= cutx and x1 <= cutx:
+                        x2 = cutx
+
+                if i == 2:
+                    if y2 < cuty or x2 < cutx:
+                        continue
+                    if y2 >= cuty and y1 <= cuty:
+                        y1 = cuty
+                    if x2 >= cutx and x1 <= cutx:
+                        x1 = cutx
+
+                if i == 3:
+                    if y1 > cuty or x2 < cutx:
+                        continue
+                    if y2 >= cuty and y1 <= cuty:
+                        y2 = cuty
+                    if x2 >= cutx and x1 <= cutx:
+                        x1 = cutx
+                tmp_box.append(x1)
+                tmp_box.append(y1)
+                tmp_box.append(x2)
+                tmp_box.append(y2)
+                tmp_box.append(box[-1])
+                merge_bbox.append(tmp_box)
+        return merge_bbox
+
+    def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
+        """Build one mosaic training sample from several annotated images.
+
+        Every image is optionally mirrored, rescaled with a random aspect-ratio
+        distortion and pasted onto its own grey canvas. The canvases are then
+        cut at a random seam and stitched together, the stitched image gets a
+        random HSV gain, and the boxes are clipped by ``merge_bboxes``.
+
+        Args:
+            annotation_line: Iterable of annotation strings. Each string is
+                split on whitespace; the first token is opened as an image
+                path and every further token is parsed as comma-separated
+                integers describing one box. The stitching step reads the
+                canvases of lines 0..3, so four lines are expected.
+            input_shape: ``(h, w)`` of the output image.
+            jitter: Half-width of the range used for the random aspect-ratio
+                distortion.
+            hue: Strength of the random hue gain.
+            sat: Strength of the random saturation gain.
+            val: Strength of the random value gain.
+
+        Returns:
+            ``(new_image, new_boxes)``: the stitched image as a uint8 array of
+            shape ``(h, w, 3)`` and the list produced by ``merge_bboxes``.
+        """
+        h, w = input_shape
+        # Relative position of the seam; presumably self.rand(a, b) draws
+        # uniformly from [a, b) - TODO confirm against its definition.
+        min_offset_x = self.rand(0.3, 0.7)
+        min_offset_y = self.rand(0.3, 0.7)
+
+        image_datas = [] 
+        box_datas   = []
+        index       = 0
+        for line in annotation_line:
+            #---------------------------------#
+            #   Split the annotation line
+            #---------------------------------#
+            line_content = line.split()
+            #---------------------------------#
+            #   Open the image
+            #---------------------------------#
+            image = Image.open(line_content[0])
+            image = cvtColor(image)
+            
+            #---------------------------------#
+            #   Size of the image
+            #---------------------------------#
+            iw, ih = image.size
+            #---------------------------------#
+            #   Parse the box coordinates
+            #---------------------------------#
+            box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
+            
+            #---------------------------------#
+            #   Randomly mirror the image
+            #   NOTE(review): because of the len(box) test an image without
+            #   boxes is never mirrored - confirm this is intended.
+            #---------------------------------#
+            flip = self.rand()<.5
+            if flip and len(box)>0:
+                image = image.transpose(Image.FLIP_LEFT_RIGHT)
+                box[:, [0,2]] = iw - box[:, [2,0]]
+
+            #------------------------------------------#
+            #   Rescale the image and distort its
+            #   aspect ratio
+            #------------------------------------------#
+            new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
+            scale = self.rand(.4, 1)
+            if new_ar < 1:
+                nh = int(scale*h)
+                nw = int(nh*new_ar)
+            else:
+                nw = int(scale*w)
+                nh = int(nw/new_ar)
+            image = image.resize((nw, nh), Image.BICUBIC)
+
+            #-----------------------------------------------#
+            #   Place the image so that one of its corners
+            #   touches the seam; the index selects which of
+            #   the four tiles it will fill
+            #-----------------------------------------------#
+            if index == 0:
+                dx = int(w*min_offset_x) - nw
+                dy = int(h*min_offset_y) - nh
+            elif index == 1:
+                dx = int(w*min_offset_x) - nw
+                dy = int(h*min_offset_y)
+            elif index == 2:
+                dx = int(w*min_offset_x)
+                dy = int(h*min_offset_y)
+            elif index == 3:
+                dx = int(w*min_offset_x)
+                dy = int(h*min_offset_y) - nh
+            
+            new_image = Image.new('RGB', (w,h), (128,128,128))
+            new_image.paste(image, (dx, dy))
+            image_data = np.array(new_image)
+
+            index = index + 1
+            box_data = []
+            #---------------------------------#
+            #   Transform the boxes the same
+            #   way, clip them to the canvas
+            #   and drop boxes that are not
+            #   wider and taller than 1 pixel
+            #---------------------------------#
+            if len(box)>0:
+                np.random.shuffle(box)
+                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+                box[:, 0:2][box[:, 0:2]<0] = 0
+                box[:, 2][box[:, 2]>w] = w
+                box[:, 3][box[:, 3]>h] = h
+                box_w = box[:, 2] - box[:, 0]
+                box_h = box[:, 3] - box[:, 1]
+                box = box[np.logical_and(box_w>1, box_h>1)]
+                box_data = np.zeros((len(box),5))
+                box_data[:len(box)] = box
+            
+            image_datas.append(image_data)
+            box_datas.append(box_data)
+
+        #---------------------------------#
+        #   Cut the canvases at the seam
+        #   and stitch them together:
+        #   0 top-left, 1 bottom-left,
+        #   2 bottom-right, 3 top-right
+        #---------------------------------#
+        cutx = int(w * min_offset_x)
+        cuty = int(h * min_offset_y)
+
+        new_image = np.zeros([h, w, 3])
+        new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
+        new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
+        new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
+        new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]
+
+        new_image       = np.array(new_image, np.uint8)
+        #---------------------------------#
+        #   Colour jitter:
+        #   draw one random gain per HSV
+        #   channel
+        #---------------------------------#
+        r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
+        #---------------------------------#
+        #   Convert the image to HSV
+        #   (from here on hue/sat/val hold
+        #   the channels, not the strengths)
+        #---------------------------------#
+        hue, sat, val   = cv2.split(cv2.cvtColor(new_image, cv2.COLOR_RGB2HSV))
+        dtype           = new_image.dtype
+        #---------------------------------#
+        #   Apply the gains through
+        #   per-channel lookup tables
+        #---------------------------------#
+        x       = np.arange(0, 256, dtype=r.dtype)
+        lut_hue = ((x * r[0]) % 180).astype(dtype)
+        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+        new_image = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
+        new_image = cv2.cvtColor(new_image, cv2.COLOR_HSV2RGB)
+
+        #---------------------------------#
+        #   Clip the boxes of every tile
+        #   at the seam
+        #---------------------------------#
+        new_boxes = self.merge_bboxes(box_datas, cutx, cuty)
+
+        return new_image, new_boxes
+
+    def get_random_data_with_MixUp(self, image_1, box_1, image_2, box_2):
+        new_image = np.array(image_1, np.float32) * 0.5 + np.array(image_2, np.float32) * 0.5
+        if len(box_1) == 0:
+            new_boxes = box_2
+        elif len(box_2) == 0:
+            new_boxes = box_1
+        else:
+            new_boxes = np.concatenate([box_1, box_2], axis=0)
+        return new_image, new_boxes
+    
+    def get_near_points(self, x, y, i, j):
+        sub_x = x - i
+        sub_y = y - j
+        if sub_x > 0.5 and sub_y > 0.5:
+            return [[0, 0], [1, 0], [0, 1]]
+        elif sub_x < 0.5 and sub_y > 0.5:
+            return [[0, 0], [-1, 0], [0, 1]]
+        elif sub_x < 0.5 and sub_y < 0.5:
+            return [[0, 0], [-1, 0], [0, -1]]
+        else:
+            return [[0, 0], [1, 0], [0, -1]]
+
+    def get_target(self, targets):
+        """Build the per-layer training targets for a set of ground-truth boxes.
+
+        Args:
+            targets: 2-D array with one row per box. Columns 0/1 are used as
+                the box centre, columns 2/3 as its width/height and column 4
+                as the class index. The values are multiplied by the grid
+                size, so they are presumably normalised to [0, 1] - TODO
+                confirm against the caller.
+
+        Returns:
+            A list with one float32 array per feature layer, shaped
+            ``(anchors_in_layer, grid_h, grid_w, self.bbox_attrs)``. A positive
+            cell stores the box (x, y, w, h in grid units), a 1 at index 4 and
+            a 1 at index ``5 + class``. All arrays are zero when ``targets``
+            is empty.
+        """
+        #-----------------------------------------------------------#
+        #   Number of feature layers
+        #-----------------------------------------------------------#
+        num_layers  = len(self.anchors_mask)
+        
+        input_shape = np.array(self.input_shape, dtype='int32')
+        # Layer l has a stride of 32, 16, 8 or 4 pixels.
+        grid_shapes = [input_shape // {0:32, 1:16, 2:8, 3:4}[l] for l in range(num_layers)]
+        y_true      = [np.zeros((len(self.anchors_mask[l]), grid_shapes[l][0], grid_shapes[l][1], self.bbox_attrs), dtype='float32') for l in range(num_layers)]
+        # Ratio of the box currently stored in every cell; 0 marks a free cell.
+        box_best_ratio = [np.zeros((len(self.anchors_mask[l]), grid_shapes[l][0], grid_shapes[l][1]), dtype='float32') for l in range(num_layers)]
+        
+        if len(targets) == 0:
+            return y_true
+        
+        for l in range(num_layers):
+            in_h, in_w      = grid_shapes[l]
+            # Anchors expressed in grid units of this layer.
+            anchors         = np.array(self.anchors) / {0:32, 1:16, 2:8, 3:4}[l]
+            
+            batch_target = np.zeros_like(targets)
+            #-------------------------------------------------------#
+            #   Scale the boxes to the grid of this feature layer
+            #-------------------------------------------------------#
+            batch_target[:, [0,2]]  = targets[:, [0,2]] * in_w
+            batch_target[:, [1,3]]  = targets[:, [1,3]] * in_h
+            batch_target[:, 4]      = targets[:, 4]
+            #-------------------------------------------------------#
+            #   wh                          : num_true_box, 2
+            #   np.expand_dims(wh, 1)       : num_true_box, 1, 2
+            #   anchors                     : 9, 2
+            #   np.expand_dims(anchors, 0)  : 1, 9, 2
+            #   
+            #   ratios_of_gt_anchors: width/height ratio of every
+            #   ground-truth box to every anchor
+            #   ratios_of_gt_anchors    : num_true_box, 9, 2
+            #   ratios_of_anchors_gt: width/height ratio of every
+            #   anchor to every ground-truth box
+            #   ratios_of_anchors_gt    : num_true_box, 9, 2
+            #
+            #   ratios                  : num_true_box, 9, 4
+            #   max_ratios: largest of the four ratios for every
+            #   (ground-truth box, anchor) pair
+            #   max_ratios              : num_true_box, 9
+            #-------------------------------------------------------#
+            ratios_of_gt_anchors = np.expand_dims(batch_target[:, 2:4], 1) / np.expand_dims(anchors, 0)
+            ratios_of_anchors_gt = np.expand_dims(anchors, 0) / np.expand_dims(batch_target[:, 2:4], 1)
+            ratios               = np.concatenate([ratios_of_gt_anchors, ratios_of_anchors_gt], axis = -1)
+            max_ratios           = np.max(ratios, axis = -1)
+            
+            for t, ratio in enumerate(max_ratios):
+                #-------------------------------------------------------#
+                #   ratio : 9
+                #   Despite its name, over_threshold marks the anchors
+                #   whose ratio is BELOW the threshold; the best
+                #   matching anchor is always marked.
+                #-------------------------------------------------------#
+                over_threshold = ratio < self.threshold
+                over_threshold[np.argmin(ratio)] = True
+                for k, mask in enumerate(self.anchors_mask[l]):
+                    if not over_threshold[mask]:
+                        continue
+                    #----------------------------------------#
+                    #   Grid cell that contains the box centre
+                    #   x  1.25     => 1
+                    #   y  3.75     => 3
+                    #----------------------------------------#
+                    i = int(np.floor(batch_target[t, 0]))
+                    j = int(np.floor(batch_target[t, 1]))
+                    
+                    # The cell itself plus two neighbouring cells become positives.
+                    offsets = self.get_near_points(batch_target[t, 0], batch_target[t, 1], i, j)
+                    for offset in offsets:
+                        local_i = i + offset[0]
+                        local_j = j + offset[1]
+
+                        # Skip neighbours that fall outside the grid.
+                        if local_i >= in_w or local_i < 0 or local_j >= in_h or local_j < 0:
+                            continue
+
+                        # The cell is already taken: keep whichever box has
+                        # the smaller (better) ratio for this anchor.
+                        if box_best_ratio[l][k, local_j, local_i] != 0:
+                            if box_best_ratio[l][k, local_j, local_i] > ratio[mask]:
+                                y_true[l][k, local_j, local_i, :] = 0
+                            else:
+                                continue
+                            
+                        #----------------------------------------#
+                        #   Class of the ground-truth box
+                        #----------------------------------------#
+                        c = int(batch_target[t, 4])
+
+                        #----------------------------------------#
+                        #   Store the box (grid units), the
+                        #   objectness flag and the one-hot class
+                        #----------------------------------------#
+                        y_true[l][k, local_j, local_i, 0] = batch_target[t, 0]
+                        y_true[l][k, local_j, local_i, 1] = batch_target[t, 1]
+                        y_true[l][k, local_j, local_i, 2] = batch_target[t, 2]
+                        y_true[l][k, local_j, local_i, 3] = batch_target[t, 3]
+                        y_true[l][k, local_j, local_i, 4] = 1
+                        y_true[l][k, local_j, local_i, c + 5] = 1
+                        #----------------------------------------#
+                        #   Remember the ratio of the box that
+                        #   now occupies this cell
+                        #----------------------------------------#
+                        box_best_ratio[l][k, local_j, local_i] = ratio[mask]
+                        
+        return y_true
+    
+# DataLoader中collate_fn使用
+def yolo_dataset_collate(batch):
+    images  = []
+    bboxes  = []
+    y_trues = [[] for _ in batch[0][2]]
+    for img, box, y_true in batch:
+        images.append(img)
+        bboxes.append(box)
+        for i, sub_y_true in enumerate(y_true):
+            y_trues[i].append(sub_y_true)
+            
+    images  = torch.from_numpy(np.array(images)).type(torch.FloatTensor)
+    bboxes  = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in bboxes]
+    y_trues = [torch.from_numpy(np.array(ann, np.float32)).type(torch.FloatTensor) for ann in y_trues]
+    return images, bboxes,y_trues
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -0,0 +1,117 @@
+import random
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+#---------------------------------------------------------#
+#   将图像转换成RGB图像，防止灰度图在预测时报错。
+#   代码仅仅支持RGB图像的预测，所有其它类型的图像都会转化成RGB
+#---------------------------------------------------------#
+def cvtColor(image):
+    if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
+        return image 
+    else:
+        image = image.convert('RGB')
+        return image 
+
+#---------------------------------------------------#
+#   对输入图像进行resize
+#---------------------------------------------------#
+def resize_image(image, size, letterbox_image):
+    iw, ih  = image.size
+    w, h    = size
+    if letterbox_image:
+        scale   = min(w/iw, h/ih)
+        nw      = int(iw*scale)
+        nh      = int(ih*scale)
+
+        image   = image.resize((nw,nh), Image.BICUBIC)
+        new_image = Image.new('RGB', size, (128,128,128))
+        new_image.paste(image, ((w-nw)//2, (h-nh)//2))
+    else:
+        new_image = image.resize((w, h), Image.BICUBIC)
+    return new_image
+
+#---------------------------------------------------#
+#   获得类
+#---------------------------------------------------#
+def get_classes(classes_path):
+    with open(classes_path, encoding='utf-8') as f:
+        class_names = f.readlines()
+    class_names = [c.strip() for c in class_names]
+    return class_names, len(class_names)
+
+#---------------------------------------------------#
+#   获得先验框
+#---------------------------------------------------#
+def get_anchors(anchors_path):
+    '''loads the anchors from a file'''
+    with open(anchors_path, encoding='utf-8') as f:
+        anchors = f.readline()
+    anchors = [float(x) for x in anchors.split(',')]
+    anchors = np.array(anchors).reshape(-1, 2)
+    return anchors, len(anchors)
+
+#---------------------------------------------------#
+#   获得学习率
+#---------------------------------------------------#
+def get_lr(optimizer):
+    for param_group in optimizer.param_groups:
+        return param_group['lr']
+    
+#---------------------------------------------------#
+#   设置种子
+#---------------------------------------------------#
+def seed_everything(seed=11):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+#---------------------------------------------------#
+#   设置Dataloader的种子
+#---------------------------------------------------#
+def worker_init_fn(worker_id, rank, seed):
+    worker_seed = rank + seed
+    random.seed(worker_seed)
+    np.random.seed(worker_seed)
+    torch.manual_seed(worker_seed)
+
+def preprocess_input(image):
+    """Divide the pixel values by 255 and return the image.
+
+    The division uses ``/=``; for an argument type that implements in-place
+    division (presumably a float numpy array - TODO confirm against the
+    callers) the caller's object is modified as well.
+    """
+    image /= 255.0
+    return image
+
+def show_config(**kwargs):
+    print('Configurations:')
+    print('-' * 70)
+    print('|%25s | %40s|' % ('keys', 'values'))
+    print('-' * 70)
+    for key, value in kwargs.items():
+        print('|%25s | %40s|' % (str(key), str(value)))
+    print('-' * 70)
+        
+def download_weights(backbone, phi, model_dir="./model_data"):
+    import os
+    from torch.hub import load_state_dict_from_url
+    if backbone == "cspdarknet":
+        backbone = backbone + "_" + phi
+    
+    download_urls = {
+        "convnext_tiny"         : "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/convnext_tiny_1k_224_ema_no_jit.pth",
+        "convnext_small"        : "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/convnext_small_1k_224_ema_no_jit.pth",
+        "cspdarknet_s"          : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_s_backbone.pth',
+        'cspdarknet_m'          : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_m_backbone.pth',
+        'cspdarknet_l'          : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_l_backbone.pth',
+        'cspdarknet_x'          : 'https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/cspdarknet_x_backbone.pth',
+        'swin_transfomer_tiny'  : "https://github.com/bubbliiiing/yolov5-pytorch/releases/download/v1.0/swin_tiny_patch4_window7.pth",
+    }
+    url = download_urls[backbone]
+    
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    load_state_dict_from_url(url, model_dir)
--- a/utils/utils_bbox.py
+++ b/utils/utils_bbox.py
@@ -0,0 +1,637 @@
+import numpy as np
+import torch
+from torchvision.ops import nms
+
+
+class DecodeBox():
+    def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]]):
+        super(DecodeBox, self).__init__()
+        self.anchors        = anchors
+        self.num_classes    = num_classes
+        self.bbox_attrs     = 5 + num_classes
+        self.input_shape    = input_shape
+        #-----------------------------------------------------------#
+        #   20x20的特征层对应的anchor是[116,90],[156,198],[373,326]
+        #   40x40的特征层对应的anchor是[30,61],[62,45],[59,119]
+        #   80x80的特征层对应的anchor是[10,13],[16,30],[33,23]
+        #-----------------------------------------------------------#
+        self.anchors_mask   = anchors_mask
+
+    def decode_box(self, inputs):
+        outputs = []
+        for i, input in enumerate(inputs):
+            #-----------------------------------------------#
+            #   输入的input一共有三个，他们的shape分别是
+            #   batch_size = 1
+            #   batch_size, 3 * (4 + 1 + 80), 20, 20
+            #   batch_size, 255, 40, 40
+            #   batch_size, 255, 80, 80
+            #-----------------------------------------------#
+            batch_size      = input.size(0)
+            input_height    = input.size(2)
+            input_width     = input.size(3)
+
+            #-----------------------------------------------#
+            #   输入为640x640时
+            #   stride_h = stride_w = 32、16、8
+            #-----------------------------------------------#
+            stride_h = self.input_shape[0] / input_height
+            stride_w = self.input_shape[1] / input_width
+            #-------------------------------------------------#
+            #   此时获得的scaled_anchors大小是相对于特征层的
+            #-------------------------------------------------#
+            scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors[self.anchors_mask[i]]]
+
+            #-----------------------------------------------#
+            #   输入的input一共有三个，他们的shape分别是
+            #   batch_size, 3, 20, 20, 85
+            #   batch_size, 3, 40, 40, 85
+            #   batch_size, 3, 80, 80, 85
+            #-----------------------------------------------#
+            prediction = input.view(batch_size, len(self.anchors_mask[i]),
+                                    self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
+
+            #-----------------------------------------------#
+            #   先验框的中心位置的调整参数
+            #-----------------------------------------------#
+            x = torch.sigmoid(prediction[..., 0])  
+            y = torch.sigmoid(prediction[..., 1])
+            #-----------------------------------------------#
+            #   先验框的宽高调整参数
+            #-----------------------------------------------#
+            w = torch.sigmoid(prediction[..., 2]) 
+            h = torch.sigmoid(prediction[..., 3]) 
+            #-----------------------------------------------#
+            #   获得置信度，是否有物体
+            #-----------------------------------------------#
+            conf        = torch.sigmoid(prediction[..., 4])
+            #-----------------------------------------------#
+            #   种类置信度
+            #-----------------------------------------------#
+            pred_cls    = torch.sigmoid(prediction[..., 5:])
+
+            FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
+            LongTensor  = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
+
+            #----------------------------------------------------------#
+            #   生成网格，先验框中心，网格左上角 
+            #   batch_size,3,20,20
+            #----------------------------------------------------------#
+            grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
+                batch_size * len(self.anchors_mask[i]), 1, 1).view(x.shape).type(FloatTensor)
+            grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
+                batch_size * len(self.anchors_mask[i]), 1, 1).view(y.shape).type(FloatTensor)
+
+            #----------------------------------------------------------#
+            #   按照网格格式生成先验框的宽高
+            #   batch_size,3,20,20
+            #----------------------------------------------------------#
+            anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
+            anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
+            anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
+            anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
+
+            #----------------------------------------------------------#
+            #   利用预测结果对先验框进行调整
+            #   首先调整先验框的中心，从先验框中心向右下角偏移
+            #   再调整先验框的宽高。
+            #   x 0 ~ 1 => 0 ~ 2 => -0.5, 1.5 => 负责一定范围的目标的预测
+            #   y 0 ~ 1 => 0 ~ 2 => -0.5, 1.5 => 负责一定范围的目标的预测
+            #   w 0 ~ 1 => 0 ~ 2 => 0 ~ 4 => 先验框的宽高调节范围为0~4倍
+            #   h 0 ~ 1 => 0 ~ 2 => 0 ~ 4 => 先验框的宽高调节范围为0~4倍
+            #----------------------------------------------------------#
+            pred_boxes          = FloatTensor(prediction[..., :4].shape)
+            pred_boxes[..., 0]  = x.data * 2. - 0.5 + grid_x
+            pred_boxes[..., 1]  = y.data * 2. - 0.5 + grid_y
+            pred_boxes[..., 2]  = (w.data * 2) ** 2 * anchor_w
+            pred_boxes[..., 3]  = (h.data * 2) ** 2 * anchor_h
+
+            #----------------------------------------------------------#
+            #   将输出结果归一化成小数的形式
+            #----------------------------------------------------------#
+            _scale = torch.Tensor([input_width, input_height, input_width, input_height]).type(FloatTensor)
+            output = torch.cat((pred_boxes.view(batch_size, -1, 4) / _scale,
+                                conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
+            outputs.append(output.data)
+        return outputs
+
+    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
+        #-----------------------------------------------------------------#
+        #   把y轴放前面是因为方便预测框和图像的宽高进行相乘
+        #-----------------------------------------------------------------#
+        box_yx = box_xy[..., ::-1]
+        box_hw = box_wh[..., ::-1]
+        input_shape = np.array(input_shape)
+        image_shape = np.array(image_shape)
+
+        if letterbox_image:
+            #-----------------------------------------------------------------#
+            #   这里求出来的offset是图像有效区域相对于图像左上角的偏移情况
+            #   new_shape指的是宽高缩放情况
+            #-----------------------------------------------------------------#
+            new_shape = np.round(image_shape * np.min(input_shape/image_shape))
+            offset  = (input_shape - new_shape)/2./input_shape
+            scale   = input_shape/new_shape
+
+            box_yx  = (box_yx - offset) * scale
+            box_hw *= scale
+
+        box_mins    = box_yx - (box_hw / 2.)
+        box_maxes   = box_yx + (box_hw / 2.)
+        boxes  = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
+        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
+        return boxes
+
+    def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
+        """Filter decoded predictions by score and per-class NMS.
+
+        Args:
+            prediction: Tensor ``[batch_size, num_anchors, 5 + num_classes]``
+                whose first four columns are centre x, centre y, width and
+                height. NOTE: those four columns are overwritten in place
+                with corner coordinates.
+            num_classes: Number of class score columns that follow column 4.
+            input_shape: Network input shape, forwarded to
+                ``yolo_correct_boxes``.
+            image_shape: Original image shape, forwarded to
+                ``yolo_correct_boxes``.
+            letterbox_image: Whether the input was letterboxed, forwarded to
+                ``yolo_correct_boxes``.
+            conf_thres: Minimum ``objectness * class score`` of a kept box.
+            nms_thres: IoU threshold handed to ``nms``.
+
+        Returns:
+            A list with one entry per image: ``None`` when nothing passed the
+            score threshold, otherwise a numpy array with 7 columns - the box
+            as returned by ``yolo_correct_boxes``, obj_conf, class_conf and
+            class_pred.
+        """
+        #----------------------------------------------------------#
+        #   Convert the boxes from centre/size format to
+        #   top-left / bottom-right corner format.
+        #   prediction  [batch_size, num_anchors, 85]
+        #----------------------------------------------------------#
+        box_corner          = prediction.new(prediction.shape)
+        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+        prediction[:, :, :4] = box_corner[:, :, :4]
+
+        output = [None for _ in range(len(prediction))]
+        for i, image_pred in enumerate(prediction):
+            #----------------------------------------------------------#
+            #   Take the maximum over the class scores.
+            #   class_conf  [num_anchors, 1]    class score
+            #   class_pred  [num_anchors, 1]    class index
+            #----------------------------------------------------------#
+            class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
+
+            #----------------------------------------------------------#
+            #   First filter: objectness * class score
+            #   NOTE(review): the mask is already 1-D; squeeze() would
+            #   turn it into a 0-d tensor if there were exactly one
+            #   anchor, which changes the indexing below - confirm
+            #   that this case cannot occur.
+            #----------------------------------------------------------#
+            conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
+
+            #----------------------------------------------------------#
+            #   Keep only the predictions that passed the filter
+            #----------------------------------------------------------#
+            image_pred = image_pred[conf_mask]
+            class_conf = class_conf[conf_mask]
+            class_pred = class_pred[conf_mask]
+            if not image_pred.size(0):
+                continue
+            #-------------------------------------------------------------------------#
+            #   detections  [num_anchors, 7]
+            #   The 7 columns are: x1, y1, x2, y2, obj_conf, class_conf, class_pred
+            #-------------------------------------------------------------------------#
+            detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
+
+            #------------------------------------------#
+            #   All classes present in the detections
+            #------------------------------------------#
+            unique_labels = detections[:, -1].cpu().unique()
+
+            if prediction.is_cuda:
+                unique_labels = unique_labels.cuda()
+                detections = detections.cuda()
+
+            for c in unique_labels:
+                #------------------------------------------#
+                #   All filtered detections of one class
+                #------------------------------------------#
+                detections_class = detections[detections[:, -1] == c]
+
+                #------------------------------------------#
+                #   The NMS shipped with torchvision is faster.
+                #   Within a region it keeps the highest scoring
+                #   box of the class.
+                #------------------------------------------#
+                keep = nms(
+                    detections_class[:, :4],
+                    detections_class[:, 4] * detections_class[:, 5],
+                    nms_thres
+                )
+                max_detections = detections_class[keep]
+                
+                # # Sort by objectness * class score
+                # _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
+                # detections_class = detections_class[conf_sort_index]
+                # # Run non-maximum suppression
+                # max_detections = []
+                # while detections_class.size(0):
+                #     # Take the highest scoring box of the class, then drop every remaining box whose overlap with it exceeds nms_thres
+                #     max_detections.append(detections_class[0].unsqueeze(0))
+                #     if len(detections_class) == 1:
+                #         break
+                #     ious = bbox_iou(max_detections[-1], detections_class[1:])
+                #     detections_class = detections_class[1:][ious < nms_thres]
+                # # Stack the results
+                # max_detections = torch.cat(max_detections).data
+                
+                # Add max detections to outputs
+                output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))
+            
+            if output[i] is not None:
+                # Convert the corners back to centre/size and map the boxes
+                # onto the original image.
+                output[i]           = output[i].cpu().numpy()
+                box_xy, box_wh      = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2]
+                output[i][:, :4]    = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
+        return output
+
+
+class DecodeBoxNP():
+    """NumPy version of the YOLOv5 head decoder and non-max suppression.
+
+    The constructor stores:
+        anchors:      anchor (width, height) pairs in network-input pixels,
+                      indexed with the entries of ``anchors_mask``.
+        num_classes:  number of object classes.
+        bbox_attrs:   values predicted per anchor, ``5 + num_classes``
+                      (x, y, w, h, objectness, one score per class).
+        input_shape:  network input size as (height, width).
+        anchors_mask: for every feature level, the indices of its anchors.
+                      Defaults to ``[[6, 7, 8], [3, 4, 5], [0, 1, 2]]``.
+    """
+    def __init__(self, anchors, num_classes, input_shape, anchors_mask = None):
+        super(DecodeBoxNP, self).__init__()
+        self.anchors        = anchors
+        self.num_classes    = num_classes
+        self.bbox_attrs     = 5 + num_classes
+        self.input_shape    = input_shape
+        # Build the default here so that instances never share one mutable list.
+        self.anchors_mask   = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if anchors_mask is None else anchors_mask
+
+    def sigmoid(self, x):
+        """Return the element-wise logistic sigmoid of ``x``."""
+        return 1 / (1 + np.exp(-x))
+
+    def decode_box(self, inputs):
+        """Decode raw head outputs into normalized boxes and scores.
+
+        Args:
+            inputs: sequence of arrays, one per feature level, each shaped
+                (batch_size, num_anchors * (5 + num_classes), height, width).
+                Level ``i`` uses the anchors selected by ``anchors_mask[i]``.
+
+        Returns:
+            A list with one array per level, shaped
+            (batch_size, num_anchors * height * width, 5 + num_classes).
+            Columns are center x, center y, width, height (each divided by
+            the feature-map width/height), objectness and the class scores.
+        """
+        outputs = []
+        for level, feature in enumerate(inputs):
+            batch_size, _, input_height, input_width = np.shape(feature)
+            level_mask  = self.anchors_mask[level]
+            num_anchors = len(level_mask)
+
+            #-----------------------------------------------#
+            #   For a 640x640 input the strides are 32, 16, 8
+            #-----------------------------------------------#
+            stride_h = self.input_shape[0] / input_height
+            stride_w = self.input_shape[1] / input_width
+            #-------------------------------------------------#
+            #   Anchors expressed in feature-map cells
+            #-------------------------------------------------#
+            scaled_anchors = np.asarray(self.anchors, dtype=np.float64)[level_mask] / np.array([stride_w, stride_h])
+
+            #-----------------------------------------------#
+            #   batch, A * attrs, H, W  =>  batch, A, H, W, attrs
+            #-----------------------------------------------#
+            prediction = np.transpose(
+                np.reshape(feature, (batch_size, num_anchors, self.bbox_attrs, input_height, input_width)),
+                (0, 1, 3, 4, 2))
+
+            # Center offsets, size factors, objectness and class scores.
+            x           = self.sigmoid(prediction[..., 0])
+            y           = self.sigmoid(prediction[..., 1])
+            w           = self.sigmoid(prediction[..., 2])
+            h           = self.sigmoid(prediction[..., 3])
+            conf        = self.sigmoid(prediction[..., 4])
+            pred_cls    = self.sigmoid(prediction[..., 5:])
+
+            #----------------------------------------------------------#
+            #   Cell coordinates and per-anchor sizes. Broadcasting
+            #   against (batch, A, H, W) replaces the explicit tiling.
+            #----------------------------------------------------------#
+            grid_x   = np.arange(input_width, dtype=np.float64)                 # (W,)
+            grid_y   = np.arange(input_height, dtype=np.float64)[:, None]       # (H, 1)
+            anchor_w = scaled_anchors[:, 0].reshape(1, num_anchors, 1, 1)
+            anchor_h = scaled_anchors[:, 1].reshape(1, num_anchors, 1, 1)
+
+            #----------------------------------------------------------#
+            #   x, y: sigmoid in 0~1 => 0~2 => -0.5~1.5 around the cell
+            #   w, h: sigmoid in 0~1 => 0~2 => 0~4 times the anchor size
+            #----------------------------------------------------------#
+            pred_boxes = np.stack([
+                x * 2. - 0.5 + grid_x,
+                y * 2. - 0.5 + grid_y,
+                (w * 2) ** 2 * anchor_w,
+                (h * 2) ** 2 * anchor_h,
+            ], axis=-1)
+
+            #----------------------------------------------------------#
+            #   Normalize the boxes by the feature-map size
+            #----------------------------------------------------------#
+            _scale = np.array([input_width, input_height, input_width, input_height])
+            output = np.concatenate([np.reshape(pred_boxes, (batch_size, -1, 4)) / _scale,
+                                np.reshape(conf, (batch_size, -1, 1)), np.reshape(pred_cls, (batch_size, -1, self.num_classes))], -1)
+            outputs.append(output)
+        return outputs
+
+    def bbox_iou(self, box1, box2, x1y1x2y2=True):
+        """Compute the IoU between two sets of boxes (NumPy broadcasting applies).
+
+        Args:
+            box1, box2: 2-D arrays whose first four columns describe a box.
+            x1y1x2y2: True when the columns are (x1, y1, x2, y2); False when
+                they are (center x, center y, width, height).
+
+        Returns:
+            Array of IoU values. The union is floored at 1e-6 so that
+            degenerate boxes do not divide by zero.
+        """
+        if not x1y1x2y2:
+            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
+            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
+            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
+            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+        else:
+            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+        inter_w = np.maximum(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1), 0)
+        inter_h = np.maximum(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1), 0)
+        inter_area = inter_w * inter_h
+
+        b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
+        b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+
+        return inter_area / np.maximum(b1_area + b2_area - inter_area, 1e-6)
+
+    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
+        """Map normalized network-input boxes onto the original image.
+
+        Args:
+            box_xy: array (..., 2) of box centers (x, y), normalized to 0~1.
+            box_wh: array (..., 2) of box sizes (w, h), normalized to 0~1.
+            input_shape: network input size as (height, width).
+            image_shape: original image size as (height, width).
+            letterbox_image: when true, remove the offset and scaling that
+                an aspect-preserving (padded) resize introduced.
+
+        Returns:
+            Array (..., 4) of (y_min, x_min, y_max, x_max) in image pixels.
+            The input arrays are left unmodified.
+        """
+        #-----------------------------------------------------------------#
+        #   y goes first so the boxes multiply directly with (height, width)
+        #-----------------------------------------------------------------#
+        box_yx = box_xy[..., ::-1]
+        box_hw = box_wh[..., ::-1]
+        input_shape = np.array(input_shape)
+        image_shape = np.array(image_shape)
+
+        if letterbox_image:
+            #-----------------------------------------------------------------#
+            #   offset: position of the valid image area inside the input
+            #   scale:  ratio between the input and the valid image area
+            #-----------------------------------------------------------------#
+            new_shape = np.round(image_shape * np.min(input_shape/image_shape))
+            offset  = (input_shape - new_shape)/2./input_shape
+            scale   = input_shape/new_shape
+
+            box_yx  = (box_yx - offset) * scale
+            # box_hw is a reversed view of the caller's array: multiply into a
+            # new array instead of ``*=`` so the argument is not overwritten.
+            box_hw  = box_hw * scale
+
+        box_mins    = box_yx - (box_hw / 2.)
+        box_maxes   = box_yx + (box_hw / 2.)
+        boxes  = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
+        return boxes * np.concatenate([image_shape, image_shape], axis=-1)
+
+    def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
+        """Filter decoded predictions by score and per-class non-max suppression.
+
+        Args:
+            prediction: float array (batch_size, num_anchors, 5 + num_classes)
+                holding (center x, center y, w, h, objectness, class scores).
+                Its first four columns are rewritten in place to corner format.
+            num_classes: number of class-score columns to read.
+            input_shape, image_shape, letterbox_image: forwarded to
+                ``yolo_correct_boxes``.
+            conf_thres: minimum objectness * best class score to keep a box.
+            nms_thres: a box is dropped when its IoU with an already kept box
+                of the same class is not below this value.
+
+        Returns:
+            A list with one entry per image: ``None`` when nothing passed the
+            score filter, otherwise an array (num_detections, 7) of
+            (box from ``yolo_correct_boxes``, objectness, class score, class id).
+        """
+        #----------------------------------------------------------#
+        #   Convert center/size boxes to corner format (in place)
+        #----------------------------------------------------------#
+        centers = prediction[:, :, 0:2].copy()
+        half_wh = prediction[:, :, 2:4] / 2
+        prediction[:, :, 0:2] = centers - half_wh
+        prediction[:, :, 2:4] = centers + half_wh
+
+        output = [None for _ in range(len(prediction))]
+        for i, image_pred in enumerate(prediction):
+            #----------------------------------------------------------#
+            #   Best class score and class index of every anchor,
+            #   both shaped [num_anchors, 1]
+            #----------------------------------------------------------#
+            class_scores = image_pred[:, 5:5 + num_classes]
+            class_conf = np.max(class_scores, 1, keepdims=True)
+            class_pred = np.expand_dims(np.argmax(class_scores, 1), -1)
+
+            #----------------------------------------------------------#
+            #   First filter: objectness * class score.
+            #   The mask is already 1-D; squeezing it would turn a
+            #   single-row mask into a 0-d mask and break row indexing.
+            #----------------------------------------------------------#
+            conf_mask = image_pred[:, 4] * class_conf[:, 0] >= conf_thres
+
+            image_pred = image_pred[conf_mask]
+            class_conf = class_conf[conf_mask]
+            class_pred = class_pred[conf_mask]
+            if not np.shape(image_pred)[0]:
+                continue
+            #-------------------------------------------------------------------------#
+            #   detections  [num_kept, 7]
+            #   columns: x1, y1, x2, y2, obj_conf, class_conf, class_pred
+            #-------------------------------------------------------------------------#
+            detections = np.concatenate((image_pred[:, :5], class_conf, class_pred), 1)
+
+            for c in np.unique(detections[:, -1]):
+                # All detections of this class, best score first.
+                detections_class = detections[detections[:, -1] == c]
+                conf_sort_index     = np.argsort(detections_class[:, 4] * detections_class[:, 5])[::-1]
+                detections_class    = detections_class[conf_sort_index]
+
+                # Greedy NMS: keep the best box, drop what overlaps it too much, repeat.
+                max_detections = []
+                while np.shape(detections_class)[0]:
+                    max_detections.append(detections_class[0:1])
+                    if len(detections_class) == 1:
+                        break
+                    ious                = self.bbox_iou(max_detections[-1], detections_class[1:])
+                    detections_class    = detections_class[1:][ious < nms_thres]
+                max_detections = np.concatenate(max_detections, 0)
+
+                output[i] = max_detections if output[i] is None else np.concatenate((output[i], max_detections))
+
+            if output[i] is not None:
+                # Back to center/size form, then onto the original image.
+                box_xy, box_wh      = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2]
+                output[i][:, :4]    = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
+        return output
+    
+
+if __name__ == "__main__":
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    #---------------------------------------------------#
+    #   Decode one feature level and visualise one cell
+    #---------------------------------------------------#
+    def get_anchors_and_decode(input, input_shape, anchors, anchors_mask, num_classes):
+        """Decode a feature map and plot the anchors and decoded boxes of one cell.
+
+        Demo helper: it always uses the anchors selected by ``anchors_mask[2]``,
+        draws on top of ``img/street.jpg`` and opens a matplotlib window.
+
+        Args:
+            input:        tensor shaped
+                          (batch_size, len(anchors_mask[2]) * (5 + num_classes), height, width).
+            input_shape:  network input size as (height, width).
+            anchors:      array of anchor (width, height) pairs in input pixels.
+            anchors_mask: anchor indices per feature level; only entry 2 is used.
+            num_classes:  number of object classes.
+        """
+        batch_size      = input.size(0)
+        input_height    = input.size(2)
+        input_width     = input.size(3)
+        level_mask      = anchors_mask[2]
+        num_anchors     = len(level_mask)
+
+        #-----------------------------------------------#
+        #   input_shape = [640, 640] with a 20x20 feature map
+        #   gives stride_h = stride_w = 640 / 20 = 32
+        #-----------------------------------------------#
+        stride_h = input_shape[0] / input_height
+        stride_w = input_shape[1] / input_width
+        #-------------------------------------------------#
+        #   Anchors expressed in feature-map cells
+        #-------------------------------------------------#
+        scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in anchors[level_mask]]
+
+        #-----------------------------------------------#
+        #   batch_size, 3 * (4 + 1 + num_classes), 20, 20 =>
+        #   batch_size, 3, 5 + num_classes, 20, 20  =>
+        #   batch_size, 3, 20, 20, 4 + 1 + num_classes
+        #-----------------------------------------------#
+        prediction = input.view(batch_size, num_anchors,
+                                num_classes + 5, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
+
+        # Center offsets and size factors (objectness and class scores are not
+        # needed for the plot, so they are not decoded here).
+        x = torch.sigmoid(prediction[..., 0])
+        y = torch.sigmoid(prediction[..., 1])
+        w = torch.sigmoid(prediction[..., 2])
+        h = torch.sigmoid(prediction[..., 3])
+
+        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
+        LongTensor  = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
+
+        #----------------------------------------------------------#
+        #   Cell grids shaped [batch_size, 3, 20, 20]:
+        #   grid_x holds the column index of every cell,
+        #   grid_y holds the row index of every cell.
+        #----------------------------------------------------------#
+        grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
+            batch_size * num_anchors, 1, 1).view(x.shape).type(FloatTensor)
+        grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
+            batch_size * num_anchors, 1, 1).view(y.shape).type(FloatTensor)
+
+        #----------------------------------------------------------#
+        #   Anchor sizes tiled over the grid:
+        #   batch_size, 3, 20 * 20 => batch_size, 3, 20, 20
+        #----------------------------------------------------------#
+        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
+        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
+        anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
+        anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
+
+        #----------------------------------------------------------#
+        #   x  0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_x
+        #   y  0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_y
+        #   w  0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_w
+        #   h  0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_h
+        #----------------------------------------------------------#
+        pred_boxes          = FloatTensor(prediction[..., :4].shape)
+        pred_boxes[..., 0]  = x.data * 2. - 0.5 + grid_x
+        pred_boxes[..., 1]  = y.data * 2. - 0.5 + grid_y
+        pred_boxes[..., 2]  = (w.data * 2) ** 2 * anchor_w
+        pred_boxes[..., 3]  = (h.data * 2) ** 2 * anchor_h
+
+        # Cell (row, column) whose anchors and predictions are drawn.
+        point_h = 5
+        point_w = 5
+
+        # Back from feature-map cells to input pixels, using the real strides
+        # (x with stride_w, y with stride_h) rather than a fixed 32.
+        stride_xy       = np.array([stride_w, stride_h])
+        box_xy          = pred_boxes[..., 0:2].cpu().numpy() * stride_xy
+        box_wh          = pred_boxes[..., 2:4].cpu().numpy() * stride_xy
+        grid_x          = grid_x.cpu().numpy() * stride_w
+        grid_y          = grid_y.cpu().numpy() * stride_h
+        anchor_w        = anchor_w.cpu().numpy() * stride_w
+        anchor_h        = anchor_h.cpu().numpy() * stride_h
+
+        from PIL import Image
+        # PIL sizes are (width, height).
+        img = Image.open("img/street.jpg").resize([input_shape[1], input_shape[0]])
+        fig = plt.figure()
+
+        def start_subplot(position):
+            # The pyplot calls act on the axes that add_subplot just made current.
+            ax = fig.add_subplot(position)
+            plt.imshow(img, alpha=0.5)
+            plt.ylim(-30, input_shape[0] + 10)
+            plt.xlim(-30, input_shape[1] + 10)
+            plt.scatter(grid_x, grid_y)
+            # x is the column (width) index, y is the row (height) index.
+            plt.scatter(point_w * stride_w, point_h * stride_h, c='black')
+            return ax
+
+        def add_boxes(ax, left, top, width, height):
+            # One red outline per anchor of the selected cell (first image of the batch).
+            for anchor_index in range(num_anchors):
+                cell = (0, anchor_index, point_h, point_w)
+                ax.add_patch(plt.Rectangle([left[cell], top[cell]], width[cell], height[cell], color="r", fill=False))
+
+        # Left panel: the raw anchors, centered on the cell's grid point.
+        ax = start_subplot(121)
+        plt.gca().invert_yaxis()
+        add_boxes(ax, grid_x - anchor_w / 2, grid_y - anchor_h / 2, anchor_w, anchor_h)
+
+        # Right panel: the decoded boxes and their centers.
+        ax = start_subplot(122)
+        plt.scatter(box_xy[0, :, point_h, point_w, 0], box_xy[0, :, point_h, point_w, 1], c='r')
+        plt.gca().invert_yaxis()
+        add_boxes(ax, box_xy[..., 0] - box_wh[..., 0] / 2, box_xy[..., 1] - box_wh[..., 1] / 2,
+                  box_wh[..., 0], box_wh[..., 1])
+
+        plt.show()
+
+    # Random 20x20 feature map for 80 classes: 3 * (5 + 80) = 255 channels.
+    feat            = torch.from_numpy(np.random.normal(0.2, 0.5, [4, 255, 20, 20])).float()
+    anchors         = np.array([[116, 90], [156, 198], [373, 326], [30,61], [62,45], [59,119], [10,13], [16,30], [33,23]])
+    anchors_mask    = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+    get_anchors_and_decode(feat, [640, 640], anchors, anchors_mask, 80)
--- a/utils/utils_fit.py
+++ b/utils/utils_fit.py
@@ -0,0 +1,149 @@
+import os
+
+import torch
+from tqdm import tqdm
+
+from utils.utils import get_lr
+        
+def fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
+    """Run one training pass and one validation pass, then log and save weights.
+
+    Args:
+        model_train:    model used for the forward passes (switched to train mode here).
+        model:          model whose ``state_dict`` is saved when ``ema`` is falsy.
+        ema:            optional EMA wrapper; when truthy it is updated after every
+                        step and ``ema.ema`` is used for validation and saving.
+        yolo_loss:      callable ``(level, output, targets, y_true) -> loss``.
+        loss_history:   object with ``append_loss(epoch, loss, val_loss)`` and a
+                        ``val_loss`` sequence.
+        eval_callback:  object with ``on_epoch_end(epoch, model)``.
+        optimizer:      optimizer stepped once per training batch.
+        epoch:          zero-based index of the current epoch.
+        epoch_step:     number of training batches to consume.
+        epoch_step_val: number of validation batches to consume.
+        gen, gen_val:   iterables yielding ``(images, targets, y_trues)`` batches.
+        Epoch:          total number of epochs (used for display and the final save).
+        cuda:           move every batch to GPU ``local_rank`` when true.
+        fp16:           run the forward pass under autocast and step through ``scaler``.
+        scaler:         gradient scaler, used only when ``fp16`` is true.
+        save_period:    save a numbered checkpoint every this many epochs.
+        save_dir:       directory that receives the weight files.
+        local_rank:     process rank; only rank 0 prints, logs and saves.
+    """
+    is_main     = local_rank == 0
+    train_total = 0
+    val_total   = 0
+
+    def load_batch(batch):
+        # Unpack a batch and, when CUDA is enabled, copy it to this rank's GPU.
+        images, targets, y_trues = batch[0], batch[1], batch[2]
+        with torch.no_grad():
+            if cuda:
+                images  = images.cuda(local_rank)
+                targets = [ann.cuda(local_rank) for ann in targets]
+                y_trues = [ann.cuda(local_rank) for ann in y_trues]
+        return images, targets, y_trues
+
+    def summed_loss(outputs, targets, y_trues):
+        # Accumulate the loss of every output level.
+        total = 0
+        for level, level_output in enumerate(outputs):
+            total += yolo_loss(level, level_output, targets, y_trues[level])
+        return total
+
+    #----------------------#
+    #   Training
+    #----------------------#
+    if is_main:
+        print('Start Train')
+        pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
+    model_train.train()
+    for iteration, batch in enumerate(gen):
+        if iteration >= epoch_step:
+            break
+
+        images, targets, y_trues = load_batch(batch)
+        optimizer.zero_grad()
+        if fp16:
+            from torch.cuda.amp import autocast
+            # Forward pass and loss in mixed precision; backward through the scaler.
+            with autocast():
+                loss_value = summed_loss(model_train(images), targets, y_trues)
+            scaler.scale(loss_value).backward()
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            loss_value = summed_loss(model_train(images), targets, y_trues)
+            loss_value.backward()
+            optimizer.step()
+        if ema:
+            ema.update(model_train)
+
+        train_total += loss_value.item()
+
+        if is_main:
+            pbar.set_postfix(loss=train_total / (iteration + 1), lr=get_lr(optimizer))
+            pbar.update(1)
+
+    #----------------------#
+    #   Validation
+    #----------------------#
+    if is_main:
+        pbar.close()
+        print('Finish Train')
+        print('Start Validation')
+        pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
+
+    model_train_eval = ema.ema if ema else model_train.eval()
+
+    for iteration, batch in enumerate(gen_val):
+        if iteration >= epoch_step_val:
+            break
+        images, targets, y_trues = load_batch(batch)
+        with torch.no_grad():
+            optimizer.zero_grad()
+            loss_value = summed_loss(model_train_eval(images), targets, y_trues)
+
+        val_total += loss_value.item()
+        if is_main:
+            pbar.set_postfix(val_loss=val_total / (iteration + 1))
+            pbar.update(1)
+
+    # Only the main process reports and writes files.
+    if not is_main:
+        return
+
+    pbar.close()
+    print('Finish Validation')
+    mean_loss       = train_total / epoch_step
+    mean_val_loss   = val_total / epoch_step_val
+    loss_history.append_loss(epoch + 1, mean_loss, mean_val_loss)
+    eval_callback.on_epoch_end(epoch + 1, model_train_eval)
+    print('Epoch:'+ str(epoch + 1) + '/' + str(Epoch))
+    print('Total Loss: %.3f || Val Loss: %.3f ' % (mean_loss, mean_val_loss))
+
+    #-----------------------------------------------#
+    #   Save the weights
+    #-----------------------------------------------#
+    save_state_dict = ema.ema.state_dict() if ema else model.state_dict()
+
+    is_last_epoch = epoch + 1 == Epoch
+    if (epoch + 1) % save_period == 0 or is_last_epoch:
+        checkpoint_name = "ep%03d-loss%.3f-val_loss%.3f.pth" % (epoch + 1, mean_loss, mean_val_loss)
+        torch.save(save_state_dict, os.path.join(save_dir, checkpoint_name))
+
+    # The history already contains this epoch, so "<= min" means "best so far".
+    if len(loss_history.val_loss) <= 1 or mean_val_loss <= min(loss_history.val_loss):
+        print('Save best model to best_epoch_weights.pth')
+        torch.save(save_state_dict, os.path.join(save_dir, "best_epoch_weights.pth"))
+
+    torch.save(save_state_dict, os.path.join(save_dir, "last_epoch_weights.pth"))
--- a/utils/utils_map.py
+++ b/utils/utils_map.py
@@ -0,0 +1,923 @@
+import glob
+import json
+import math
+import operator
+import os
+import shutil
+import sys
+try:
+    from pycocotools.coco import COCO
+    from pycocotools.cocoeval import COCOeval
+except:
+    pass
+import cv2
+import matplotlib
+matplotlib.use('Agg')
+from matplotlib import pyplot as plt
+import numpy as np
+
+'''
+    0,0 ------> x (width)
+     |
+     |  (Left,Top)
+     |      *_________
+     |      |         |
+            |         |
+     y      |_________|
+  (height)            *
+                (Right,Bottom)
+'''
+
+def log_average_miss_rate(precision, fp_cumsum, num_images):
+    """
+        log-average miss rate:
+            Geometric mean of the miss rates sampled at 9 FPPI reference
+            points, evenly spaced in log-space between 1e-2 and 1e0.
+
+        input:
+                precision  | array of precision values
+                fp_cumsum  | array of cumulative false-positive counts
+                num_images | number of images used to normalise fp_cumsum
+
+        output:
+                lamr | log-average miss rate
+                mr | miss rate (1 - precision)
+                fppi | false positives per image
+
+        An empty precision array returns (0, 1, 0).
+
+        references:
+            [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
+               State of the Art." Pattern Analysis and Machine Intelligence, IEEE
+               Transactions on 34.4 (2012): 743 - 761.
+    """
+    if precision.size == 0:
+        return 0, 1, 0
+
+    mr = (1 - precision)
+    fppi = fp_cumsum / float(num_images)
+
+    # Sentinel entry (fppi = -1, mr = 1) so that every reference point has
+    # at least one sample at or below it.
+    padded_fppi = np.insert(fppi, 0, -1.0)
+    padded_mr = np.insert(mr, 0, 1.0)
+
+    # For each reference point take the miss rate of the last sample whose
+    # FPPI does not exceed it.
+    sampled_mr = np.array(
+        [padded_mr[np.where(padded_fppi <= point)[-1][-1]]
+         for point in np.logspace(-2.0, 0.0, num = 9)],
+        dtype=np.float64)
+
+    # Floor at 1e-10 so that log() never sees zero.
+    lamr = math.exp(np.mean(np.log(np.maximum(1e-10, sampled_mr))))
+
+    return lamr, mr, fppi
+
+"""
+ throw error and exit
+"""
+def error(msg):
+    """Print ``msg`` to standard output and terminate with exit status 0."""
+    print(msg)
+    raise SystemExit(0)
+
+"""
+ check if the number is a float between 0.0 and 1.0
+"""
+def is_float_between_0_and_1(value):
+    """Return True when ``value`` converts to a float strictly between 0.0 and 1.0.
+
+    A value whose conversion raises ValueError (e.g. non-numeric text) gives False.
+    """
+    try:
+        return 0.0 < float(value) < 1.0
+    except ValueError:
+        return False
+
+"""
+ Calculate the AP given the recall and precision array
+    1st) We compute a version of the measured precision/recall curve with
+         precision monotonically decreasing
+    2nd) We compute the AP as the area under this curve by numerical integration.
+"""
+def voc_ap(rec, prec):
+    """
+    Compute the VOC2012-style average precision from a recall/precision curve.
+
+    rec and prec are equally long sequences of recall and precision values.
+    They are read only: the padding below is applied to copies, so the
+    caller's sequences are left unchanged.
+
+    Returns (ap, mrec, mpre) where
+        mrec is the recall list padded with 0.0 in front and 1.0 at the end,
+        mpre is the precision list padded with 0.0 on both sides and made
+             monotonically decreasing,
+        ap   is the area under the (mrec, mpre) step curve.
+
+    --- Official matlab code VOC2012---
+    mrec=[0 ; rec ; 1];
+    mpre=[0 ; prec ; 0];
+    for i=numel(mpre)-1:-1:1
+            mpre(i)=max(mpre(i),mpre(i+1));
+    end
+    i=find(mrec(2:end)~=mrec(1:end-1))+1;
+    ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
+    """
+    mrec = [0.0] + list(rec) + [1.0]
+    mpre = [0.0] + list(prec) + [0.0]
+
+    # Precision envelope: sweep from the end so that every entry becomes the
+    # maximum of itself and everything to its right.
+    for i in range(len(mpre)-2, -1, -1):
+        mpre[i] = max(mpre[i], mpre[i+1])
+
+    # Numerical integration: add one rectangle wherever the recall changes.
+    ap = 0.0
+    for i in range(1, len(mrec)):
+        if mrec[i] != mrec[i-1]:
+            ap += ((mrec[i]-mrec[i-1])*mpre[i])
+    return ap, mrec, mpre
+
+
+"""
+ Convert the lines of a file to a list
+"""
+def file_lines_to_list(path):
+    """Return the lines of the text file at ``path`` with surrounding whitespace stripped."""
+    with open(path) as handle:
+        # strip() removes the trailing newline as well as any other
+        # leading/trailing whitespace of each line.
+        return [line.strip() for line in handle]
+
+"""
+ Draws text in image
+"""
+def draw_text_in_image(img, text, pos, color, line_width):
+    """Draw ``text`` on ``img`` with its bottom-left corner at ``pos``.
+
+    Returns the image together with ``line_width`` increased by the width of
+    the rendered text, so that consecutive calls can continue on one line.
+    """
+    font_face = cv2.FONT_HERSHEY_PLAIN
+    font_scale = 1
+    # Passed positionally after the color / font scale, which is the stroke
+    # thickness slot of cv2.putText and cv2.getTextSize.
+    thickness = 1
+    cv2.putText(img, text, pos, font_face, font_scale, color, thickness)
+    (text_width, _), _ = cv2.getTextSize(text, font_face, font_scale, thickness)
+    return img, (line_width + text_width)
+
+"""
+ Plot - adjust axes
+"""
+def adjust_axes(r, t, fig, axes):
+    """Stretch the upper x-limit of ``axes`` to make room for the text ``t``.
+
+    The text width (measured with renderer ``r``) is converted to inches and
+    the upper x-limit is scaled by the ratio between the figure width plus
+    that text width and the current figure width.
+    """
+    text_width_inches = t.get_window_extent(renderer=r).width / fig.dpi
+    fig_width = fig.get_figwidth()
+    growth = (fig_width + text_width_inches) / fig_width
+    lower, upper = axes.get_xlim()
+    axes.set_xlim([lower, upper * growth])
+
+"""
+ Draw plot using Matplotlib
+"""
+def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
+    # sort the dictionary by decreasing value, into a list of tuples
+    sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
+    # unpacking the list of tuples into two lists
+    sorted_keys, sorted_values = zip(*sorted_dic_by_value)
+    # 
+    if true_p_bar != "":
+        """
+         Special case to draw in:
+            - green -> TP: True Positives (object detected and matches ground-truth)
+            - red -> FP: False Positives (object detected but does not match ground-truth)
+            - orange -> FN: False Negatives (object not detected but present in the ground-truth)
+        """
+        fp_sorted = []
+        tp_sorted = []
+        for key in sorted_keys:
+            fp_sorted.append(dictionary[key] - true_p_bar[key])
+            tp_sorted.append(true_p_bar[key])
+        plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
+        plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
+        # add legend
+        plt.legend(loc='lower right')
+        """
+         Write number on side of bar
+        """
+        fig = plt.gcf() # gcf - get current figure
+        axes = plt.gca()
+        r = fig.canvas.get_renderer()
+        for i, val in enumerate(sorted_values):
+            fp_val = fp_sorted[i]
+            tp_val = tp_sorted[i]
+            fp_str_val = " " + str(fp_val)
+            tp_str_val = fp_str_val + " " + str(tp_val)
+            # trick to paint multicolor with offset:
+            # first paint everything and then repaint the first number
+            t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
+            plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
+            if i == (len(sorted_values)-1): # largest bar
+                adjust_axes(r, t, fig, axes)
+    else:
+        plt.barh(range(n_classes), sorted_values, color=plot_color)
+        """
+         Write number on side of bar
+        """
+        fig = plt.gcf() # gcf - get current figure
+        axes = plt.gca()
+        r = fig.canvas.get_renderer()
+        for i, val in enumerate(sorted_values):
+            str_val = " " + str(val) # add a space before
+            if val < 1.0:
+                str_val = " {0:.2f}".format(val)
+            t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
+            # re-set axes to show number inside the figure
+            if i == (len(sorted_values)-1): # largest bar
+                adjust_axes(r, t, fig, axes)
+    # set window title
+    fig.canvas.set_window_title(window_title)
+    # write classes in y axis
+    tick_font_size = 12
+    plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
+    """
+     Re-scale height accordingly
+    """
+    init_height = fig.get_figheight()
+    # comput the matrix height in points and inches
+    dpi = fig.dpi
+    height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
+    height_in = height_pt / dpi
+    # compute the required figure height 
+    top_margin = 0.15 # in percentage of the figure height
+    bottom_margin = 0.05 # in percentage of the figure height
+    figure_height = height_in / (1 - top_margin - bottom_margin)
+    # set new height
+    if figure_height > init_height:
+        fig.set_figheight(figure_height)
+
+    # set plot title
+    plt.title(plot_title, fontsize=14)
+    # set axis titles
+    # plt.xlabel('classes')
+    plt.xlabel(x_label, fontsize='large')
+    # adjust size of window
+    fig.tight_layout()
+    # save the plot
+    fig.savefig(output_path)
+    # show image
+    if to_show:
+        plt.show()
+    # close the plot
+    plt.close()
+
+def get_map(MINOVERLAP, draw_plot, score_threhold=0.5, path = './map_out'):
+    GT_PATH             = os.path.join(path, 'ground-truth')
+    DR_PATH             = os.path.join(path, 'detection-results')
+    IMG_PATH            = os.path.join(path, 'images-optional')
+    TEMP_FILES_PATH     = os.path.join(path, '.temp_files')
+    RESULTS_FILES_PATH  = os.path.join(path, 'results')
+
+    show_animation = True
+    if os.path.exists(IMG_PATH): 
+        for dirpath, dirnames, files in os.walk(IMG_PATH):
+            if not files:
+                show_animation = False
+    else:
+        show_animation = False
+
+    if not os.path.exists(TEMP_FILES_PATH):
+        os.makedirs(TEMP_FILES_PATH)
+        
+    if os.path.exists(RESULTS_FILES_PATH):
+        shutil.rmtree(RESULTS_FILES_PATH)
+    else:
+        os.makedirs(RESULTS_FILES_PATH)
+    if draw_plot:
+        try:
+            matplotlib.use('TkAgg')
+        except:
+            pass
+        os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP"))
+        os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1"))
+        os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall"))
+        os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision"))
+    if show_animation:
+        os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one"))
+
+    ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
+    if len(ground_truth_files_list) == 0:
+        error("Error: No ground-truth files found!")
+    ground_truth_files_list.sort()
+    gt_counter_per_class     = {}
+    counter_images_per_class = {}
+
+    for txt_file in ground_truth_files_list:
+        file_id     = txt_file.split(".txt", 1)[0]
+        file_id     = os.path.basename(os.path.normpath(file_id))
+        temp_path   = os.path.join(DR_PATH, (file_id + ".txt"))
+        if not os.path.exists(temp_path):
+            error_msg = "Error. File not found: {}\n".format(temp_path)
+            error(error_msg)
+        lines_list      = file_lines_to_list(txt_file)
+        bounding_boxes  = []
+        is_difficult    = False
+        already_seen_classes = []
+        for line in lines_list:
+            try:
+                if "difficult" in line:
+                    class_name, left, top, right, bottom, _difficult = line.split()
+                    is_difficult = True
+                else:
+                    class_name, left, top, right, bottom = line.split()
+            except:
+                if "difficult" in line:
+                    line_split  = line.split()
+                    _difficult  = line_split[-1]
+                    bottom      = line_split[-2]
+                    right       = line_split[-3]
+                    top         = line_split[-4]
+                    left        = line_split[-5]
+                    class_name  = ""
+                    for name in line_split[:-5]:
+                        class_name += name + " "
+                    class_name  = class_name[:-1]
+                    is_difficult = True
+                else:
+                    line_split  = line.split()
+                    bottom      = line_split[-1]
+                    right       = line_split[-2]
+                    top         = line_split[-3]
+                    left        = line_split[-4]
+                    class_name  = ""
+                    for name in line_split[:-4]:
+                        class_name += name + " "
+                    class_name = class_name[:-1]
+
+            bbox = left + " " + top + " " + right + " " + bottom
+            if is_difficult:
+                bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True})
+                is_difficult = False
+            else:
+                bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False})
+                if class_name in gt_counter_per_class:
+                    gt_counter_per_class[class_name] += 1
+                else:
+                    gt_counter_per_class[class_name] = 1
+
+                if class_name not in already_seen_classes:
+                    if class_name in counter_images_per_class:
+                        counter_images_per_class[class_name] += 1
+                    else:
+                        counter_images_per_class[class_name] = 1
+                    already_seen_classes.append(class_name)
+
+        with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile:
+            json.dump(bounding_boxes, outfile)
+
+    gt_classes  = list(gt_counter_per_class.keys())
+    gt_classes  = sorted(gt_classes)
+    n_classes   = len(gt_classes)
+
+    dr_files_list = glob.glob(DR_PATH + '/*.txt')
+    dr_files_list.sort()
+    for class_index, class_name in enumerate(gt_classes):
+        bounding_boxes = []
+        for txt_file in dr_files_list:
+            file_id = txt_file.split(".txt",1)[0]
+            file_id = os.path.basename(os.path.normpath(file_id))
+            temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
+            if class_index == 0:
+                if not os.path.exists(temp_path):
+                    error_msg = "Error. File not found: {}\n".format(temp_path)
+                    error(error_msg)
+            lines = file_lines_to_list(txt_file)
+            for line in lines:
+                try:
+                    tmp_class_name, confidence, left, top, right, bottom = line.split()
+                except:
+                    line_split      = line.split()
+                    bottom          = line_split[-1]
+                    right           = line_split[-2]
+                    top             = line_split[-3]
+                    left            = line_split[-4]
+                    confidence      = line_split[-5]
+                    tmp_class_name  = ""
+                    for name in line_split[:-5]:
+                        tmp_class_name += name + " "
+                    tmp_class_name  = tmp_class_name[:-1]
+
+                if tmp_class_name == class_name:
+                    bbox = left + " " + top + " " + right + " " +bottom
+                    bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox})
+
+        bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True)
+        with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
+            json.dump(bounding_boxes, outfile)
+
+    sum_AP = 0.0
+    ap_dictionary = {}
+    lamr_dictionary = {}
+    with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file:
+        results_file.write("# AP and precision/recall per class\n")
+        count_true_positives = {}
+
+        for class_index, class_name in enumerate(gt_classes):
+            count_true_positives[class_name] = 0
+            dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
+            dr_data = json.load(open(dr_file))
+
+            nd          = len(dr_data)
+            tp          = [0] * nd
+            fp          = [0] * nd
+            score       = [0] * nd
+            score_threhold_idx = 0
+            for idx, detection in enumerate(dr_data):
+                file_id     = detection["file_id"]
+                score[idx]  = float(detection["confidence"])
+                if score[idx] >= score_threhold:
+                    score_threhold_idx = idx
+
+                if show_animation:
+                    ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
+                    if len(ground_truth_img) == 0:
+                        error("Error. Image not found with id: " + file_id)
+                    elif len(ground_truth_img) > 1:
+                        error("Error. Multiple image with id: " + file_id)
+                    else:
+                        img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
+                        img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0]
+                        if os.path.isfile(img_cumulative_path):
+                            img_cumulative = cv2.imread(img_cumulative_path)
+                        else:
+                            img_cumulative = img.copy()
+                        bottom_border = 60
+                        BLACK = [0, 0, 0]
+                        img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
+
+                gt_file             = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
+                ground_truth_data   = json.load(open(gt_file))
+                ovmax       = -1
+                gt_match    = -1
+                bb          = [float(x) for x in detection["bbox"].split()]
+                for obj in ground_truth_data:
+                    if obj["class_name"] == class_name:
+                        bbgt    = [ float(x) for x in obj["bbox"].split() ]
+                        bi      = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])]
+                        iw      = bi[2] - bi[0] + 1
+                        ih      = bi[3] - bi[1] + 1
+                        if iw > 0 and ih > 0:
+                            ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
+                                            + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
+                            ov = iw * ih / ua
+                            if ov > ovmax:
+                                ovmax = ov
+                                gt_match = obj
+
+                if show_animation:
+                    status = "NO MATCH FOUND!" 
+                    
+                min_overlap = MINOVERLAP
+                if ovmax >= min_overlap:
+                    if "difficult" not in gt_match:
+                        if not bool(gt_match["used"]):
+                            tp[idx] = 1
+                            gt_match["used"] = True
+                            count_true_positives[class_name] += 1
+                            with open(gt_file, 'w') as f:
+                                    f.write(json.dumps(ground_truth_data))
+                            if show_animation:
+                                status = "MATCH!"
+                        else:
+                            fp[idx] = 1
+                            if show_animation:
+                                status = "REPEATED MATCH!"
+                else:
+                    fp[idx] = 1
+                    if ovmax > 0:
+                        status = "INSUFFICIENT OVERLAP"
+
+                """
+                Draw image to show animation
+                """
+                if show_animation:
+                    height, widht = img.shape[:2]
+                    white           = (255,255,255)
+                    light_blue      = (255,200,100)
+                    green           = (0,255,0)
+                    light_red       = (30,30,255)
+                    margin          = 10
+                    # 1nd line
+                    v_pos           = int(height - margin - (bottom_border / 2.0))
+                    text            = "Image: " + ground_truth_img[0] + " "
+                    img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
+                    text            = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
+                    img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
+                    if ovmax != -1:
+                        color       = light_red
+                        if status   == "INSUFFICIENT OVERLAP":
+                            text    = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
+                        else:
+                            text    = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
+                            color   = green
+                        img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
+                    # 2nd line
+                    v_pos           += int(bottom_border / 2.0)
+                    rank_pos        = str(idx+1)
+                    text            = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
+                    img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
+                    color           = light_red
+                    if status == "MATCH!":
+                        color = green
+                    text            = "Result: " + status + " "
+                    img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
+
+                    font = cv2.FONT_HERSHEY_SIMPLEX
+                    if ovmax > 0: 
+                        bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ]
+                        cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
+                        cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
+                        cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
+                    bb = [int(i) for i in bb]
+                    cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
+                    cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
+                    cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
+
+                    cv2.imshow("Animation", img)
+                    cv2.waitKey(20) 
+                    output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
+                    cv2.imwrite(output_img_path, img)
+                    cv2.imwrite(img_cumulative_path, img_cumulative)
+
+            cumsum = 0
+            for idx, val in enumerate(fp):
+                fp[idx] += cumsum
+                cumsum += val
+                
+            cumsum = 0
+            for idx, val in enumerate(tp):
+                tp[idx] += cumsum
+                cumsum += val
+
+            rec = tp[:]
+            for idx, val in enumerate(tp):
+                rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1)
+
+            prec = tp[:]
+            for idx, val in enumerate(tp):
+                prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1)
+
+            ap, mrec, mprec = voc_ap(rec[:], prec[:])
+            F1  = np.array(rec)*np.array(prec)*2 / np.where((np.array(prec)+np.array(rec))==0, 1, (np.array(prec)+np.array(rec)))
+
+            sum_AP  += ap
+            text    = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100)
+
+            if len(prec)>0:
+                F1_text         = "{0:.2f}".format(F1[score_threhold_idx]) + " = " + class_name + " F1 "
+                Recall_text     = "{0:.2f}%".format(rec[score_threhold_idx]*100) + " = " + class_name + " Recall "
+                Precision_text  = "{0:.2f}%".format(prec[score_threhold_idx]*100) + " = " + class_name + " Precision "
+            else:
+                F1_text         = "0.00" + " = " + class_name + " F1 " 
+                Recall_text     = "0.00%" + " = " + class_name + " Recall " 
+                Precision_text  = "0.00%" + " = " + class_name + " Precision " 
+
+            rounded_prec    = [ '%.2f' % elem for elem in prec ]
+            rounded_rec     = [ '%.2f' % elem for elem in rec ]
+            results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
+            
+            if len(prec)>0:
+                print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=" + "{0:.2f}".format(F1[score_threhold_idx])\
+                    + " ; Recall=" + "{0:.2f}%".format(rec[score_threhold_idx]*100) + " ; Precision=" + "{0:.2f}%".format(prec[score_threhold_idx]*100))
+            else:
+                print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=0.00% ; Recall=0.00% ; Precision=0.00%")
+            ap_dictionary[class_name] = ap
+
+            n_images = counter_images_per_class[class_name]
+            lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images)
+            lamr_dictionary[class_name] = lamr
+
+            if draw_plot:
+                plt.plot(rec, prec, '-o')
+                area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
+                area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
+                plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
+
+                fig = plt.gcf()
+                fig.canvas.set_window_title('AP ' + class_name)
+
+                plt.title('class: ' + text)
+                plt.xlabel('Recall')
+                plt.ylabel('Precision')
+                axes = plt.gca()
+                axes.set_xlim([0.0,1.0])
+                axes.set_ylim([0.0,1.05]) 
+                fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png")
+                plt.cla()
+
+                plt.plot(score, F1, "-", color='orangered')
+                plt.title('class: ' + F1_text + "\nscore_threhold=" + str(score_threhold))
+                plt.xlabel('Score_Threhold')
+                plt.ylabel('F1')
+                axes = plt.gca()
+                axes.set_xlim([0.0,1.0])
+                axes.set_ylim([0.0,1.05])
+                fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png")
+                plt.cla()
+
+                plt.plot(score, rec, "-H", color='gold')
+                plt.title('class: ' + Recall_text + "\nscore_threhold=" + str(score_threhold))
+                plt.xlabel('Score_Threhold')
+                plt.ylabel('Recall')
+                axes = plt.gca()
+                axes.set_xlim([0.0,1.0])
+                axes.set_ylim([0.0,1.05])
+                fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png")
+                plt.cla()
+
+                plt.plot(score, prec, "-s", color='palevioletred')
+                plt.title('class: ' + Precision_text + "\nscore_threhold=" + str(score_threhold))
+                plt.xlabel('Score_Threhold')
+                plt.ylabel('Precision')
+                axes = plt.gca()
+                axes.set_xlim([0.0,1.0])
+                axes.set_ylim([0.0,1.05])
+                fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png")
+                plt.cla()
+                
+        if show_animation:
+            cv2.destroyAllWindows()
+        if n_classes == 0:
+            print("未检测到任何种类，请检查标签信息与get_map.py中的classes_path是否修改。")
+            return 0
+        results_file.write("\n# mAP of all classes\n")
+        mAP     = sum_AP / n_classes
+        text    = "mAP = {0:.2f}%".format(mAP*100)
+        results_file.write(text + "\n")
+        print(text)
+
+    shutil.rmtree(TEMP_FILES_PATH)
+
+    """
+    Count total of detection-results
+    """
+    det_counter_per_class = {}
+    for txt_file in dr_files_list:
+        lines_list = file_lines_to_list(txt_file)
+        for line in lines_list:
+            class_name = line.split()[0]
+            if class_name in det_counter_per_class:
+                det_counter_per_class[class_name] += 1
+            else:
+                det_counter_per_class[class_name] = 1
+    dr_classes = list(det_counter_per_class.keys())
+
+    """
+    Write number of ground-truth objects per class to results.txt
+    """
+    with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
+        results_file.write("\n# Number of ground-truth objects per class\n")
+        for class_name in sorted(gt_counter_per_class):
+            results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
+
+    """
+    Finish counting true positives
+    """
+    for class_name in dr_classes:
+        if class_name not in gt_classes:
+            count_true_positives[class_name] = 0
+
+    """
+    Write number of detected objects per class to results.txt
+    """
+    with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
+        results_file.write("\n# Number of detected objects per class\n")
+        for class_name in sorted(dr_classes):
+            n_det = det_counter_per_class[class_name]
+            text = class_name + ": " + str(n_det)
+            text += " (tp:" + str(count_true_positives[class_name]) + ""
+            text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
+            results_file.write(text)
+
+    """
+    Plot the total number of occurences of each class in the ground-truth
+    """
+    if draw_plot:
+        window_title = "ground-truth-info"
+        plot_title = "ground-truth\n"
+        plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
+        x_label = "Number of objects per class"
+        output_path = RESULTS_FILES_PATH + "/ground-truth-info.png"
+        to_show = False
+        plot_color = 'forestgreen'
+        draw_plot_func(
+            gt_counter_per_class,
+            n_classes,
+            window_title,
+            plot_title,
+            x_label,
+            output_path,
+            to_show,
+            plot_color,
+            '',
+            )
+
+    # """
+    # Plot the total number of occurences of each class in the "detection-results" folder
+    # """
+    # if draw_plot:
+    #     window_title = "detection-results-info"
+    #     # Plot title
+    #     plot_title = "detection-results\n"
+    #     plot_title += "(" + str(len(dr_files_list)) + " files and "
+    #     count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
+    #     plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
+    #     # end Plot title
+    #     x_label = "Number of objects per class"
+    #     output_path = RESULTS_FILES_PATH + "/detection-results-info.png"
+    #     to_show = False
+    #     plot_color = 'forestgreen'
+    #     true_p_bar = count_true_positives
+    #     draw_plot_func(
+    #         det_counter_per_class,
+    #         len(det_counter_per_class),
+    #         window_title,
+    #         plot_title,
+    #         x_label,
+    #         output_path,
+    #         to_show,
+    #         plot_color,
+    #         true_p_bar
+    #         )
+
+    """
+    Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
+    """
+    if draw_plot:
+        window_title = "lamr"
+        plot_title = "log-average miss rate"
+        x_label = "log-average miss rate"
+        output_path = RESULTS_FILES_PATH + "/lamr.png"
+        to_show = False
+        plot_color = 'royalblue'
+        draw_plot_func(
+            lamr_dictionary,
+            n_classes,
+            window_title,
+            plot_title,
+            x_label,
+            output_path,
+            to_show,
+            plot_color,
+            ""
+            )
+
+    """
+    Draw mAP plot (Show AP's of all classes in decreasing order)
+    """
+    if draw_plot:
+        window_title = "mAP"
+        plot_title = "mAP = {0:.2f}%".format(mAP*100)
+        x_label = "Average Precision"
+        output_path = RESULTS_FILES_PATH + "/mAP.png"
+        to_show = True
+        plot_color = 'royalblue'
+        draw_plot_func(
+            ap_dictionary,
+            n_classes,
+            window_title,
+            plot_title,
+            x_label,
+            output_path,
+            to_show,
+            plot_color,
+            ""
+            )
+    return mAP
+
+def preprocess_gt(gt_path, class_names):
+    image_ids   = os.listdir(gt_path)
+    results = {}
+
+    images = []
+    bboxes = []
+    for i, image_id in enumerate(image_ids):
+        lines_list      = file_lines_to_list(os.path.join(gt_path, image_id))
+        boxes_per_image = []
+        image           = {}
+        image_id        = os.path.splitext(image_id)[0]
+        image['file_name'] = image_id + '.jpg'
+        image['width']     = 1
+        image['height']    = 1
+        #-----------------------------------------------------------------#
+        #   感谢 多学学英语吧 的提醒
+        #   解决了'Results do not correspond to current coco set'问题
+        #-----------------------------------------------------------------#
+        image['id']        = str(image_id)
+
+        for line in lines_list:
+            difficult = 0 
+            if "difficult" in line:
+                line_split  = line.split()
+                left, top, right, bottom, _difficult = line_split[-5:]
+                class_name  = ""
+                for name in line_split[:-5]:
+                    class_name += name + " "
+                class_name  = class_name[:-1]
+                difficult = 1
+            else:
+                line_split  = line.split()
+                left, top, right, bottom = line_split[-4:]
+                class_name  = ""
+                for name in line_split[:-4]:
+                    class_name += name + " "
+                class_name = class_name[:-1]
+            
+            left, top, right, bottom = float(left), float(top), float(right), float(bottom)
+            if class_name not in class_names:
+                continue
+            cls_id  = class_names.index(class_name) + 1
+            bbox    = [left, top, right - left, bottom - top, difficult, str(image_id), cls_id, (right - left) * (bottom - top) - 10.0]
+            boxes_per_image.append(bbox)
+        images.append(image)
+        bboxes.extend(boxes_per_image)
+    results['images']        = images
+
+    categories = []
+    for i, cls in enumerate(class_names):
+        category = {}
+        category['supercategory']   = cls
+        category['name']            = cls
+        category['id']              = i + 1
+        categories.append(category)
+    results['categories']   = categories
+
+    annotations = []
+    for i, box in enumerate(bboxes):
+        annotation = {}
+        annotation['area']        = box[-1]
+        annotation['category_id'] = box[-2]
+        annotation['image_id']    = box[-3]
+        annotation['iscrowd']     = box[-4]
+        annotation['bbox']        = box[:4]
+        annotation['id']          = i
+        annotations.append(annotation)
+    results['annotations'] = annotations
+    return results
+
+def preprocess_dr(dr_path, class_names):
+    image_ids = os.listdir(dr_path)
+    results = []
+    for image_id in image_ids:
+        lines_list      = file_lines_to_list(os.path.join(dr_path, image_id))
+        image_id        = os.path.splitext(image_id)[0]
+        for line in lines_list:
+            line_split  = line.split()
+            confidence, left, top, right, bottom = line_split[-5:]
+            class_name  = ""
+            for name in line_split[:-5]:
+                class_name += name + " "
+            class_name  = class_name[:-1]
+            left, top, right, bottom = float(left), float(top), float(right), float(bottom)
+            result                  = {}
+            result["image_id"]      = str(image_id)
+            if class_name not in class_names:
+                continue
+            result["category_id"]   = class_names.index(class_name) + 1
+            result["bbox"]          = [left, top, right - left, bottom - top]
+            result["score"]         = float(confidence)
+            results.append(result)
+    return results
+ 
+def get_coco_map(class_names, path):
+    GT_PATH     = os.path.join(path, 'ground-truth')
+    DR_PATH     = os.path.join(path, 'detection-results')
+    COCO_PATH   = os.path.join(path, 'coco_eval')
+
+    if not os.path.exists(COCO_PATH):
+        os.makedirs(COCO_PATH)
+
+    GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json')
+    DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json')
+
+    with open(GT_JSON_PATH, "w") as f:
+        results_gt  = preprocess_gt(GT_PATH, class_names)
+        json.dump(results_gt, f, indent=4)
+
+    with open(DR_JSON_PATH, "w") as f:
+        results_dr  = preprocess_dr(DR_PATH, class_names)
+        json.dump(results_dr, f, indent=4)
+        if len(results_dr) == 0:
+            print("未检测到任何目标。")
+            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+    cocoGt      = COCO(GT_JSON_PATH)
+    cocoDt      = cocoGt.loadRes(DR_JSON_PATH)
+    cocoEval    = COCOeval(cocoGt, cocoDt, 'bbox') 
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    cocoEval.summarize()
+
+    return cocoEval.stats
--- a/voc_annotation.py
+++ b/voc_annotation.py
@@ -0,0 +1,153 @@
+import os
+import random
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
+from utils.utils import get_classes
+
+#--------------------------------------------------------------------------------------------------------------------------------#
+#   annotation_mode selects what this script does when it is run.
+#   0: the whole labelling pipeline - the split lists in <VOCdevkit_path>/ImageSets/Main
+#      plus the training files 2007_train.txt and 2007_val.txt
+#   1: only the split lists in <VOCdevkit_path>/ImageSets/Main
+#   2: only the training files 2007_train.txt and 2007_val.txt
+#--------------------------------------------------------------------------------------------------------------------------------#
+annotation_mode     = 0
+#-------------------------------------------------------------------#
+#   Must be edited: supplies the class names used to write the object
+#   information into 2007_train.txt and 2007_val.txt.
+#   Keep it identical to the classes_path used for training and prediction.
+#   If the generated 2007_train.txt contains no object information,
+#   the classes are not configured correctly.
+#   The class list only affects the output of modes 0 and 2
+#   (the file itself is loaded below regardless of the mode).
+#-------------------------------------------------------------------#
+classes_path        = 'trainYolov5-v6/model_data/voc_classes.txt'
+#--------------------------------------------------------------------------------------------------------------------------------#
+#   trainval_percent is the fraction of all annotations assigned to (train + val); the rest is test.
+#   The value 0.9 gives (train + val) : test = 9 : 1.
+#   train_percent is the fraction of (train + val) assigned to train; the rest is val.
+#   The value 0.9 gives train : val = 9 : 1.
+#   Only used when annotation_mode is 0 or 1.
+#--------------------------------------------------------------------------------------------------------------------------------#
+trainval_percent    = 0.9
+train_percent       = 0.9
+#-------------------------------------------------------#
+#   Folder holding the VOC-style dataset.
+#   Annotations/, ImageSets/Main/ and JPEGImages/ are
+#   resolved relative to this folder.
+#-------------------------------------------------------#
+VOCdevkit_path  = 'Data/TrainData'
+
+# (year, image_set) pairs; each pair produces one '<year>_<image_set>.txt' file.
+VOCdevkit_sets  = [('2007', 'train'), ('2007', 'val')]
+# Loaded at import time; the second return value of get_classes is not used here.
+classes, _      = get_classes(classes_path)
+
+#-------------------------------------------------------#
+#   Counters:
+#   photo_nums - number of images per entry of VOCdevkit_sets
+#   nums       - number of kept objects per class
+#-------------------------------------------------------#
+photo_nums  = np.zeros(len(VOCdevkit_sets))
+nums        = np.zeros(len(classes))
+def convert_annotation(year, image_id, list_file):
+    in_file = open(os.path.join(VOCdevkit_path, 'Annotations/%s.xml'%(image_id)), encoding='utf-8')
+    tree=ET.parse(in_file)
+    root = tree.getroot()
+
+    for obj in root.iter('object'):
+        difficult = 0 
+        if obj.find('difficult')!=None:
+            difficult = obj.find('difficult').text
+        cls = obj.find('name').text
+        if cls not in classes or int(difficult)==1:
+            continue
+        cls_id = classes.index(cls)
+        xmlbox = obj.find('bndbox')
+        b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
+        list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
+        
+        nums[classes.index(cls)] = nums[classes.index(cls)] + 1
+        
+if __name__ == "__main__":
+    random.seed(0)
+    if " " in os.path.abspath(VOCdevkit_path):
+        raise ValueError("数据集存放的文件夹路径与图片名称中不可以存在空格，否则会影响正常的模型训练，请注意修改。")
+
+    if annotation_mode == 0 or annotation_mode == 1:
+        print("Generate txt in ImageSets.")
+        xmlfilepath     = os.path.join(VOCdevkit_path, 'Annotations')
+        saveBasePath    = os.path.join(VOCdevkit_path, 'ImageSets/Main')
+        temp_xml        = os.listdir(xmlfilepath)
+        total_xml       = []
+        for xml in temp_xml:
+            if xml.endswith(".xml"):
+                total_xml.append(xml)
+
+        num     = len(total_xml)  
+        list    = range(num)  
+        tv      = int(num*trainval_percent)  
+        tr      = int(tv*train_percent)  
+        trainval= random.sample(list,tv)  
+        train   = random.sample(trainval,tr)  
+        
+        print("train and val size",tv)
+        print("train size",tr)
+        ftrainval   = open(os.path.join(saveBasePath,'trainval.txt'), 'w')  
+        ftest       = open(os.path.join(saveBasePath,'test.txt'), 'w')  
+        ftrain      = open(os.path.join(saveBasePath,'train.txt'), 'w')  
+        fval        = open(os.path.join(saveBasePath,'val.txt'), 'w')  
+        
+        for i in list:  
+            name=total_xml[i][:-4]+'\n'  
+            if i in trainval:  
+                ftrainval.write(name)  
+                if i in train:  
+                    ftrain.write(name)  
+                else:  
+                    fval.write(name)  
+            else:  
+                ftest.write(name)  
+        
+        ftrainval.close()  
+        ftrain.close()  
+        fval.close()  
+        ftest.close()
+        print("Generate txt in ImageSets done.")
+
+    if annotation_mode == 0 or annotation_mode == 2:
+        print("Generate 2007_train.txt and 2007_val.txt for train.")
+        type_index = 0
+        for year, image_set in VOCdevkit_sets:
+            image_ids = open(os.path.join(VOCdevkit_path, 'ImageSets/Main/%s.txt'%(image_set)), encoding='utf-8').read().strip().split()
+            list_file = open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8')
+            for image_id in image_ids:
+                list_file.write('%s/JPEGImages/%s.jpg'%(os.path.abspath(VOCdevkit_path), image_id))
+
+                convert_annotation(year, image_id, list_file)
+                list_file.write('\n')
+            photo_nums[type_index] = len(image_ids)
+            type_index += 1
+            list_file.close()
+        print("Generate 2007_train.txt and 2007_val.txt for train done.")
+        
+        def printTable(List1, List2):
+            for i in range(len(List1[0])):
+                print("|", end=' ')
+                for j in range(len(List1)):
+                    print(List1[j][i].rjust(int(List2[j])), end=' ')
+                    print("|", end=' ')
+                print()
+
+        str_nums = [str(int(x)) for x in nums]
+        tableData = [
+            classes, str_nums
+        ]
+        colWidths = [0]*len(tableData)
+        len1 = 0
+        for i in range(len(tableData)):
+            for j in range(len(tableData[i])):
+                if len(tableData[i][j]) > colWidths[i]:
+                    colWidths[i] = len(tableData[i][j])
+        printTable(tableData, colWidths)
+
+        if photo_nums[0] <= 500:
+            print("训练集数量小于500，属于较小的数据量，请注意设置较大的训练世代（Epoch）以满足足够的梯度下降次数（Step）。")
+
+        if np.sum(nums) == 0:
+            print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
+            print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
+            print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
+            print("（重要的事情说三遍）。")
				`@@ -0,0 +1 @@`
				`10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326`