Source code for ssds.core.criterion

import torch
import torch.nn as nn
import torch.nn.functional as F

import math


class MultiBoxLoss(nn.Module):
    r"""The MultiBox Loss is used to calculate the classification loss in the object detection task.

    MultiBox Loss is introduced in [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325v5)
    and can be described as:

    .. math::
        L(x,c,l,g) = \frac{1}{N}\left(L_{conf}(x, c) + \alpha L_{loc}(x,l,g)\right)

    where :math:`L_{conf}` is the CrossEntropy Loss and :math:`L_{loc}` is the SmoothL1 Loss
    weighted by :math:`\alpha`, which is set to 1 by cross validation.

    Compute Targets:
        * Produce Confidence Target Indices by matching ground truth boxes
          with (default) 'priorboxes' that have jaccard index > threshold parameter
          (default threshold: 0.5).
        * Produce localization target by 'encoding' variance into offsets of ground
          truth boxes and their matched 'priorboxes'.
        * Hard negative mining to filter the excessive number of negative examples
          that comes with using a large number of default bounding boxes
          (default negative:positive ratio 3:1).

    To reduce code and make it easier to embed into the pipeline, only the
    classification loss is included in this class.

    Args:
        negpos_ratio: ratio of negative over positive samples in the given feature map. Default: 3
    """

    def __init__(self, negpos_ratio=3, **kwargs):
        super(MultiBoxLoss, self).__init__()
        self.negpos_ratio = negpos_ratio

    def forward(self, pred_logits, target, depth):
        """
        Args:
            pred_logits: Predicted class logits for each box
            target: Target class for each box
            depth: the sign of each sample from anchor matching. It splits the
                samples into 3 types: positive (>0), background/negative (=0),
                and ignore (<0)

        Returns:
            The classification loss for the given feature map
        """
        ce = F.binary_cross_entropy_with_logits(pred_logits, target, reduction="none")

        # Hard Negative Mining: rank the background anchors by their largest
        # per-class loss; positive and ignored samples are zeroed out so that
        # only negatives are ranked
        max_ce = ce.max(2)[0].view(ce.shape[0], -1)
        depth_v = depth.view(ce.shape[0], -1)
        max_ce[depth_v != 0] = 0  # exclude the pos and ignore samples
        _, idx = max_ce.sort(1, descending=True)
        _, idx_rank = idx.sort(1)

        # keep the top negpos_ratio * num_pos negatives
        num_pos = (depth_v > 0).sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio * num_pos, max=depth_v.shape[1] - 1)
        neg = idx_rank < num_neg.expand_as(idx_rank)
        neg = neg.view_as(depth)
        return ce * ((depth > 0) + neg).gt(0).expand_as(ce)
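
# A minimal usage sketch (illustrative, not part of the original module). The
# shapes are assumptions: [batch, anchors, classes] logits and a per-anchor
# matching sign `depth` of shape [batch, anchors, 1]; the criterion returns the
# per-element BCE masked to the positives plus the mined hard negatives.
def _example_multibox_loss():
    criterion = MultiBoxLoss(negpos_ratio=3)
    pred_logits = torch.randn(2, 8, 4)               # raw class logits
    target = torch.zeros(2, 8, 4)                    # one-hot class targets
    depth = torch.randint(-1, 2, (2, 8, 1)).float()  # >0 pos, =0 neg, <0 ignore
    masked_ce = criterion(pred_logits, target, depth)
    return masked_ce.sum() / (depth > 0).sum().clamp(min=1)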


class FocalLoss(nn.Module):
    r"""The Focal Loss is used to calculate the classification loss in the object detection task.

    Focal Loss is introduced in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)
    and can be described as:

    .. math::
        FL(p_t) = -\alpha (1 - p_t)^{\gamma} \ln(p_t)

    where :math:`p_t` is the predicted probability of the ground-truth class for each box,
    :math:`\alpha` controls the weight of the positive samples, and :math:`\gamma` controls
    the attention paid to the difficult samples.

    Args:
        alpha (float): the param to control the weight of the positive samples, (0,1). Default: 0.25
        gamma (float): the param to control the attention for the difficult samples, [0,n);
            [0,5] has been studied in the original paper. Default: 2
    """

    def __init__(self, alpha=0.25, gamma=2, **kwargs):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, pred_logits, target, depth):
        r"""
        Args:
            pred_logits: Predicted class logits for each box
            target: Target class for each box
            depth: Not used in this function

        Returns:
            The classification loss for the given feature map
        """
        pred = pred_logits.sigmoid()
        ce = F.binary_cross_entropy_with_logits(pred_logits, target, reduction="none")
        # alpha_t weights the positives; p_t is the probability assigned to the
        # true class, as in the paper
        alpha = target * self.alpha + (1.0 - target) * (1.0 - self.alpha)
        pt = torch.where(target == 1, pred, 1 - pred)
        return alpha * (1.0 - pt) ** self.gamma * ce
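
# A minimal usage sketch (illustrative, not part of the original module); the
# tensor shapes are assumptions, and `depth` may be None since it is unused here.
def _example_focal_loss():
    criterion = FocalLoss(alpha=0.25, gamma=2)
    pred_logits = torch.randn(2, 8, 4)
    target = (torch.rand(2, 8, 4) > 0.9).float()  # sparse binary targets
    return criterion(pred_logits, target, None).mean()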


class SmoothL1Loss(nn.Module):
    r"""The SmoothL1 Loss is used to calculate the localization loss in the object detection task.

    This criterion uses a squared term if the absolute element-wise error falls
    below :math:`\beta` and an L1 term otherwise. It is less sensitive to outliers
    than the `MSELoss` and in some cases prevents exploding gradients (e.g. see the
    `Fast R-CNN` paper by Ross Girshick). Also known as the Huber loss:

    .. math::
        \text{loss}(x_i, y_i) =
        \begin{cases}
        0.5 (x_i - y_i)^2 / \beta, & \text{if } |x_i - y_i| < \beta \\
        |x_i - y_i| - 0.5 \beta,   & \text{otherwise }
        \end{cases}

    :math:`x` and :math:`y` have arbitrary shapes with a total of :math:`n` elements
    each; the loss is returned element-wise, without reduction. :math:`\beta` is the
    threshold at which the loss switches from the quadratic to the linear branch.

    Args:
        beta (float): the param to control the threshold and smooth the loss, (0,1). Default: 0.11
    """

    def __init__(self, beta=0.11):
        super().__init__()
        self.beta = beta

    def forward(self, pred, target):
        r"""
        Args:
            pred: Predicted box for each box
            target: Target box for each box

        Returns:
            The localization loss for the given feature map
        """
        x = (pred - target).abs()
        l1 = x - 0.5 * self.beta
        l2 = 0.5 * x ** 2 / self.beta
        return torch.where(x >= self.beta, l1, l2)
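
# A minimal sketch of the quadratic-to-linear crossover (illustrative, not part
# of the original module). With beta=0.11, an error of 0.05 falls in the
# quadratic branch (0.5 * 0.05**2 / 0.11) and an error of 0.5 in the linear one
# (0.5 - 0.5 * 0.11).
def _example_smooth_l1():
    criterion = SmoothL1Loss(beta=0.11)
    pred = torch.tensor([[0.0, 0.05, 0.5]])
    target = torch.zeros(1, 3)
    return criterion(pred, target)  # element-wise losses, no reduction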


class IOULoss(nn.Module):
    r"""The IoU Loss is used to calculate the localization loss in the object detection task.

    IoU Loss is introduced in [IoU Loss for 2D/3D Object Detection](https://arxiv.org/abs/1908.03851v1)
    and can be described as:

    .. math::
        IoU(A, B) = \frac{A \cap B}{A \cup B} = \frac{A \cap B}{|A| + |B| - A \cap B}

    where A and B represent the two convex shapes; here they are the predicted box
    and the ground-truth box.

    This class actually implements multiple IoU-related losses and uses
    :attr:`loss_type` to choose the specific loss function.

    Args:
        loss_type (str): param to choose the specific loss type.
    """

    def __init__(self, loss_type="iou"):
        super(IOULoss, self).__init__()
        self.loss_type = loss_type

    def forward(self, pred, target):
        r"""
        Args:
            pred: Predicted box for each box, in the format x, y, w, h
            target: Target box for each box, in the format x, y, w, h

        Returns:
            The localization loss for the given feature map
        """
        pred_lt, pred_rb, pred_wh = self.delta2ltrb(pred)
        target_lt, target_rb, target_wh = self.delta2ltrb(target)

        # intersection and union areas; the (lt < rb) factor zeroes out
        # non-overlapping pairs
        lt = torch.max(pred_lt, target_lt)
        rb = torch.min(pred_rb, target_rb)
        area_i = torch.prod(rb - lt, dim=2) * (lt < rb).all(dim=2)
        area_a = torch.prod(pred_wh, dim=2)
        area_b = torch.prod(target_wh, dim=2)
        area_union = area_a + area_b - area_i
        iou = (area_i + 1e-7) / (area_union + 1e-7)

        if self.loss_type == "iou":
            iou = torch.clamp(iou, min=0, max=1.0).unsqueeze(2)
            return 1 - iou

        # smallest enclosing box, shared by the GIoU/DIoU/CIoU variants
        outer_lt = torch.min(pred_lt, target_lt)
        outer_rb = torch.max(pred_rb, target_rb)

        if self.loss_type == "giou":
            area_outer = (
                torch.prod(outer_rb - outer_lt, dim=2)
                * (outer_lt < outer_rb).all(dim=2)
                + 1e-7
            )
            giou = iou - (area_outer - area_union) / area_outer
            giou = torch.clamp(giou, min=-1.0, max=1.0).unsqueeze(2)
            return 1 - giou

        # squared center distance and squared diagonal of the enclosing box
        inter_diag = ((pred[:, :, :2] - target[:, :, :2]) ** 2).sum(dim=2)
        outer_diag = ((outer_rb - outer_lt) ** 2).sum(dim=2) + 1e-7

        if self.loss_type == "diou":
            diou = iou - inter_diag / outer_diag
            diou = torch.clamp(diou, min=-1.0, max=1.0).unsqueeze(2)
            return 1 - diou

        if self.loss_type == "ciou":
            # v measures aspect-ratio consistency; alpha is detached from the
            # graph, as in the original paper
            v = (4 / (math.pi ** 2)) * torch.pow(
                torch.atan(target_wh[:, :, 0] / target_wh[:, :, 1])
                - torch.atan(pred_wh[:, :, 0] / pred_wh[:, :, 1]),
                2,
            )
            with torch.no_grad():
                S = 1 - iou
                alpha = v / (S + v)
            ciou = iou - (inter_diag / outer_diag + alpha * v)
            ciou = torch.clamp(ciou, min=-1.0, max=1.0).unsqueeze(2)
            return 1 - ciou

    def delta2ltrb(self, deltas):
        """Convert deltas [x, y, w, h] of shape [batch, anchor, 4], with w and h
        stored in log space, into left-top/right-bottom corners and decoded w, h.
        """
        pred_ctr = deltas[:, :, :2]
        pred_wh = torch.exp(deltas[:, :, 2:])
        return pred_ctr - 0.5 * pred_wh, pred_ctr + 0.5 * pred_wh, pred_wh
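
# A minimal usage sketch (illustrative, not part of the original module). Boxes
# are [x, y, w, h] deltas of shape [batch, anchor, 4] with w and h in log space,
# as delta2ltrb expects; a 0.5x0.5 box centred in a 1x1 box has IoU = 0.25, so
# the returned loss is 0.75.
def _example_iou_loss():
    criterion = IOULoss(loss_type="giou")
    pred = torch.tensor([[[0.0, 0.0, math.log(0.5), math.log(0.5)]]])
    target = torch.tensor([[[0.0, 0.0, math.log(1.0), math.log(1.0)]]])
    return criterion(pred, target)  # tensor([[[0.75]]]) up to the 1e-7 epsilons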


def GIOULoss():
    r"""The GIoU Loss is used to calculate the localization loss in the object detection task.

    Generalized IoU Loss is introduced in [IoU Loss for 2D/3D Object Detection](https://arxiv.org/abs/1908.03851v1)
    and can be described as:

    .. math::
        IoU(A, B) = \frac{A \cap B}{A \cup B} = \frac{A \cap B}{|A| + |B| - A \cap B}

    .. math::
        GIoU(A, B) = IoU(A, B) - \frac{C - U}{C}

    where A and B represent the two convex shapes (here, the predicted box and the
    ground-truth box); C is the area of the smallest convex shape enclosing both A
    and B, and U is the union area :math:`|A| + |B| - A \cap B`.

    In implementation, it calls :class:`.IOULoss` with :attr:`loss_type="giou"`.
    """
    return IOULoss(loss_type="giou")


def DIOULoss():
    r"""The DIoU Loss is used to calculate the localization loss in the object detection task.

    Distance-IoU Loss is introduced in [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287v1)
    and can be described as:

    .. math::
        IoU(A, B) = \frac{A \cap B}{A \cup B} = \frac{A \cap B}{|A| + |B| - A \cap B}

    .. math::
        DIoU(A, B) = IoU(A, B) - \frac{diag_{inter}}{diag_{outer}}

    where A and B represent the two convex shapes (here, the predicted box and the
    ground-truth box); :math:`diag_{inter}` is the squared center distance between A
    and B, and :math:`diag_{outer}` is the squared diagonal length of the smallest
    enclosing box covering the two boxes.

    In implementation, it calls :class:`.IOULoss` with :attr:`loss_type="diou"`.
    """
    return IOULoss(loss_type="diou")


def CIOULoss():
    r"""The CIoU Loss is used to calculate the localization loss in the object detection task.

    Complete IoU Loss is introduced in [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287v1)
    and can be described as:

    .. math::
        IoU(A, B) = \frac{A \cap B}{A \cup B} = \frac{A \cap B}{|A| + |B| - A \cap B}

    .. math::
        DIoU(A, B) = IoU(A, B) - \frac{diag_{inter}}{diag_{outer}}

    .. math::
        CIoU(A, B) = DIoU(A, B) - \alpha v

    where A and B represent the two convex shapes (here, the predicted box and the
    ground-truth box); :math:`\alpha = \frac{v}{(1 - IoU(A, B)) + v}` and
    :math:`v = \frac{4}{\pi^2} \left(\arctan \frac{w^A}{h^A} - \arctan \frac{w^B}{h^B}\right)^2`
    is used to impose consistency of aspect ratio. In CIoU loss, the :math:`\alpha`
    factor is excluded from backpropagation.

    In implementation, it calls :class:`.IOULoss` with :attr:`loss_type="ciou"`.
    """
    return IOULoss(loss_type="ciou")
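

# Numeric intuition for the three variants above (illustrative): for a 0.5x0.5
# box centred inside a 1x1 box, IoU = 0.25; the enclosing box equals the larger
# box so GIoU = IoU, the centres coincide so DIoU = IoU, and the aspect ratios
# are equal so v = 0 and CIoU = DIoU. The variants only diverge once the boxes
# are offset or disjoint.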


if __name__ == "__main__":
    iou = IOULoss()
    giou = GIOULoss()
    diou = DIOULoss()

    box = torch.tensor([[[0.0, 0.0, 0.5, 0.5]]])
    box[:, :, 2:] = torch.log(box[:, :, 2:])
    tar = torch.tensor([[[0, 0, 1.0, 1.0]]])
    tar[:, :, 2:] = torch.log(tar[:, :, 2:])

    print("IOU: ", iou(box, tar))
    print("GIOU: ", giou(box, tar))
    print("DIOU: ", diou(box, tar))
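
    # Illustrative extension (not in the original): the CIoU variant can be
    # checked the same way; for these concentric, equal-aspect boxes all four
    # losses agree at 1 - IoU = 0.75.
    ciou = CIOULoss()
    print("CIOU: ", ciou(box, tar))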