o
    i Ñ  ã                   @  s<  d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	Z	ddl	m
Z
mZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) e)ddd\Z*Z+e)ddd\Z,Z+G dd„ dej-ƒZ.			d*d+d(d)„Z/dS ),z{
Part of this script is adapted from
https://github.com/pytorch/vision/blob/main/torchvision/models/detection/retinanet.py
é    )ÚannotationsN)ÚCallableÚSequence)ÚAny)ÚTensorÚnn)Ú	RetinaNetÚresnet_fpn_feature_extractor)ÚAnchorGenerator)ÚATSSMatcher)ÚBoxCoder)ÚBoxSelector)Úcheck_training_targetsÚpreprocess_images)ÚHardNegativeSampler)Úensure_dict_value_to_list_Úpredict_with_inferer)Úbox_iou)ÚSlidingWindowInferer)Úresnet)Ú	BlendModeÚPytorchPadModeÚensure_tuple_repÚoptional_importz#torchvision.models.detection._utilsÚBalancedPositiveNegativeSampler)ÚnameÚMatcherc                
      sF  e Zd ZdZeddddddfd”‡ fdd„Zd•dd„Zd–dd „Zd—d#d$„Zd˜d&d'„Z	d™d+d,„Z
	-dšd›d2d3„Zdœdd8d9„Z	:dždŸd?d@„Zd dAdB„ZddCejdDejdEddddf
d¡dUdV„Z	W	X	C	Y	-d¢d£d_d`„Z		d¤d¥dgdh„Zdidj„ Zd¦dodp„Zd§dsdt„Z	-dšd¨d~d„Zd©d€d„Zdªd‚dƒ„Zd«d†d‡„Zd¬dˆd‰„Zd­dŽd„Zd®d’d“„Z‡  ZS )¯ÚRetinaNetDetectoraÿ  
    Retinanet detector, expandable to other one stage anchor based box detectors in the future.
    An example of construction can found in the source code of
    :func:`~monai.apps.detection.networks.retinanet_detector.retinanet_resnet50_fpn_detector` .

    The input to the model is expected to be a list of tensors, each of shape (C, H, W) or  (C, H, W, D),
    one for each image, and should be in 0-1 range. Different images can have different sizes.
    Or it can also be a Tensor sized (B, C, H, W) or  (B, C, H, W, D). In this case, all images have same size.

    The behavior of the model changes depending if it is in training or evaluation mode.

    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
    containing:

    - boxes (``FloatTensor[N, 4]`` or ``FloatTensor[N, 6]``): the ground-truth boxes in ``StandardMode``, i.e.,
      ``[xmin, ymin, xmax, ymax]`` or ``[xmin, ymin, zmin, xmax, ymax, zmax]`` format,
      with ``0 <= xmin < xmax <= H``, ``0 <= ymin < ymax <= W``, ``0 <= zmin < zmax <= D``.
    - labels: the class label for each ground-truth box

    The model returns a Dict[str, Tensor] during training, containing the classification and regression
    losses.
    When saving the model, only self.network contains trainable parameters and needs to be saved.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows:

    - boxes (``FloatTensor[N, 4]`` or ``FloatTensor[N, 6]``): the predicted boxes in ``StandardMode``, i.e.,
      ``[xmin, ymin, xmax, ymax]`` or ``[xmin, ymin, zmin, xmax, ymax, zmax]`` format,
      with ``0 <= xmin < xmax <= H``, ``0 <= ymin < ymax <= W``, ``0 <= zmin < zmax <= D``.
    - labels (Int64Tensor[N]): the predicted labels for each image
    - labels_scores (Tensor[N]): the scores for each prediction

    Args:
        network: a network that takes an image Tensor sized (B, C, H, W) or (B, C, H, W, D) as input
            and outputs a dictionary Dict[str, List[Tensor]] or Dict[str, Tensor].
        anchor_generator: anchor generator.
        box_overlap_metric: func that compute overlap between two sets of boxes, default is Intersection over Union (IoU).
        debug: whether to print out internal parameters, used for debugging and parameter tuning.

    Notes:

        Input argument ``network`` can be a monai.apps.detection.networks.retinanet_network.RetinaNet(*) object,
        but any network that meets the following rules is a valid input ``network``.

        1. It should have attributes including spatial_dims, num_classes, cls_key, box_reg_key, num_anchors, size_divisible.

            - spatial_dims (int) is the spatial dimension of the network, we support both 2D and 3D.
            - num_classes (int) is the number of classes, excluding the background.
            - size_divisible (int or Sequence[int]) is the expectation on the input image shape.
              The network needs the input spatial_size to be divisible by size_divisible, length should be 2 or 3.
            - cls_key (str) is the key to represent classification in the output dict.
            - box_reg_key (str) is the key to represent box regression in the output dict.
            - num_anchors (int) is the number of anchor shapes at each location. it should equal to
              ``self.anchor_generator.num_anchors_per_location()[0]``.

            If network does not have these attributes, user needs to provide them for the detector.

        2. Its input should be an image Tensor sized (B, C, H, W) or (B, C, H, W, D).

        3. About its output ``head_outputs``, it should be either a list of tensors or a dictionary of str: List[Tensor]:

            - If it is a dictionary, it needs to have at least two keys:
              ``network.cls_key`` and ``network.box_reg_key``, representing predicted classification maps and box regression maps.
              ``head_outputs[network.cls_key]`` should be List[Tensor] or Tensor. Each Tensor represents
              classification logits map at one resolution level,
              sized (B, num_classes*num_anchors, H_i, W_i) or (B, num_classes*num_anchors, H_i, W_i, D_i).
              ``head_outputs[network.box_reg_key]`` should be List[Tensor] or Tensor. Each Tensor represents
              box regression map at one resolution level,
              sized (B, 2*spatial_dims*num_anchors, H_i, W_i)or (B, 2*spatial_dims*num_anchors, H_i, W_i, D_i).
              ``len(head_outputs[network.cls_key]) == len(head_outputs[network.box_reg_key])``.
            - If it is a list of 2N tensors, the first N tensors should be the predicted classification maps,
              and the second N tensors should be the predicted box regression maps.

    Example:

        .. code-block:: python

            # define a naive network
            import torch
            class NaiveNet(torch.nn.Module):
                def __init__(self, spatial_dims: int, num_classes: int):
                    super().__init__()
                    self.spatial_dims = spatial_dims
                    self.num_classes = num_classes
                    self.size_divisible = 2
                    self.cls_key = "cls"
                    self.box_reg_key = "box_reg"
                    self.num_anchors = 1
                def forward(self, images: torch.Tensor):
                    spatial_size = images.shape[-self.spatial_dims:]
                    out_spatial_size = tuple(s//self.size_divisible for s in spatial_size)  # half size of input
                    out_cls_shape = (images.shape[0],self.num_classes*self.num_anchors) + out_spatial_size
                    out_box_reg_shape = (images.shape[0],2*self.spatial_dims*self.num_anchors) + out_spatial_size
                    return {self.cls_key: [torch.randn(out_cls_shape)], self.box_reg_key: [torch.randn(out_box_reg_shape)]}

            # create a RetinaNetDetector detector
            spatial_dims = 3
            num_classes = 5
            anchor_generator = monai.apps.detection.utils.anchor_utils.AnchorGeneratorWithAnchorShape(
                feature_map_scales=(1, ), base_anchor_shapes=((8,) * spatial_dims)
            )
            net = NaiveNet(spatial_dims, num_classes)
            detector = RetinaNetDetector(net, anchor_generator)

            # only detector.network may contain trainable parameters.
            optimizer = torch.optim.SGD(
                detector.network.parameters(),
                1e-3,
                momentum=0.9,
                weight_decay=3e-5,
                nesterov=True,
            )
            torch.save(detector.network.state_dict(), 'model.pt')  # save model
            detector.network.load_state_dict(torch.load('model.pt', weights_only=True))  # load model
    Né   ÚclassificationÚbox_regressionFÚnetworkú	nn.ModuleÚanchor_generatorr
   Úbox_overlap_metricr   Úspatial_dimsú
int | NoneÚnum_classesÚsize_divisibleúSequence[int] | intÚcls_keyÚstrÚbox_reg_keyÚdebugÚboolc
                   sR  t ƒ  ¡  || _| jd|d| _| jd|d| _| jd|d| _t| j| jƒ| _| jd|d| _| jd|d| _	|| _
| j
 ¡ d | _| jd| jd}
| j|
kr\td	|
› d
| j› dƒ‚d | _d | _|| _|	| _d | _|  tjjdd¡ | jtjjdddddd td| j d| _d| _d| _| jd | _d | _t| jdddddd| _ d S )Nr%   )Údefault_valuer'   r(   r*   r,   r   Únum_anchorsz Number of feature map channels (z8) should match with number of anchors at each location (z).Úmean)Ú	reductiongÇqÇq¼?)Úbetar2   TF)Ú	encode_gtÚdecode_pred)ç      ð?r6   ©ÚweightsÚboxesÚlabelsÚ_scoresçš™™™™™©?éè  ç      à?é,  )r$   Úscore_threshÚtopk_candidates_per_levelÚ
nms_threshÚdetections_per_imgÚapply_sigmoid)!ÚsuperÚ__init__r!   Úget_attribute_from_networkr%   r'   r(   r   r*   r,   r#   Únum_anchors_per_locationÚnum_anchors_per_locÚ
ValueErrorÚanchorsÚprevious_image_shaper$   r-   Úfg_bg_samplerÚset_cls_lossÚtorchr   ÚBCEWithLogitsLossÚset_box_regression_lossÚSmoothL1Lossr   Ú	box_coderÚtarget_box_keyÚtarget_label_keyÚpred_score_keyÚinfererr   Úbox_selector)Úselfr!   r#   r$   r%   r'   r(   r*   r,   r-   Znetwork_num_anchors©Ú	__class__© úr/home/dell461/cl/sdc2/last_ska_mid/HISourceFinder-master-l/src/monai/apps/detection/networks/retinanet_detector.pyrF   º   sN   

ÿÿÿúzRetinaNetDetector.__init__c                 C  s4   t | j|ƒrt| j|ƒS |d ur|S td|› dƒ‚)Nz network does not have attribute z$, please provide it in the detector.)Úhasattrr!   ÚgetattrrJ   )rY   Ú	attr_namer/   r\   r\   r]   rG     s
   z,RetinaNetDetector.get_attribute_from_networkr8   útuple[float]ÚreturnÚNonec                 C  s>   t |ƒd| j krtdd| j › d|› dƒ‚t|d| _dS )z…
        Set the weights for box coder.

        Args:
            weights: a list/tuple with length of 2*self.spatial_dims

        é   zlen(weights) should be z, got weights=Ú.r7   N)Úlenr%   rJ   r   rS   )rY   r8   r\   r\   r]   Úset_box_coder_weights
  s   z'RetinaNetDetector.set_box_coder_weightsÚbox_keyÚ	label_keyc                 C  s   || _ || _|d | _dS )aB  
        Set keys for the training targets and inference outputs.
        During training, both box_key and label_key should be keys in the targets
        when performing ``self.forward(input_images, targets)``.
        During inference, they will be the keys in the output dict of `self.forward(input_images)``.
        r;   N)rT   rU   rV   )rY   rh   ri   r\   r\   r]   Úset_target_keys  s   z!RetinaNetDetector.set_target_keysÚcls_lossc                 C  s
   || _ dS )a¡  
        Using for training. Set loss for classification that takes logits as inputs, make sure sigmoid/softmax is built in.

        Args:
            cls_loss: loss module for classification

        Example:
            .. code-block:: python

                detector.set_cls_loss(torch.nn.BCEWithLogitsLoss(reduction="mean"))
                detector.set_cls_loss(FocalLoss(reduction="mean", gamma=2.0))
        N)Úcls_loss_func)rY   rk   r\   r\   r]   rN   !  s   
zRetinaNetDetector.set_cls_lossÚbox_lossr4   r5   c                 C  s   || _ || _|| _dS )a¿  
        Using for training. Set loss for box regression.

        Args:
            box_loss: loss module for box regression
            encode_gt: if True, will encode ground truth boxes to target box regression
                before computing the losses. Should be True for L1 loss and False for GIoU loss.
            decode_pred: if True, will decode predicted box regression into predicted boxes
                before computing losses. Should be False for L1 loss and True for GIoU loss.

        Example:
            .. code-block:: python

                detector.set_box_regression_loss(
                    torch.nn.SmoothL1Loss(beta=1.0 / 9, reduction="mean"),
                    encode_gt = True, decode_pred = False
                )
                detector.set_box_regression_loss(
                    monai.losses.giou_loss.BoxGIoULoss(reduction="mean"),
                    encode_gt = False, decode_pred = True
                )
        N)Úbox_loss_funcr4   r5   )rY   rm   r4   r5   r\   r\   r]   rQ   0  s   
z)RetinaNetDetector.set_box_regression_lossTÚfg_iou_threshÚfloatÚbg_iou_threshÚallow_low_quality_matchesc                 C  s2   ||k rt d|› d|› dƒ‚t|||d| _dS )aò  
        Using for training. Set torchvision matcher that matches anchors with ground truth boxes.

        Args:
            fg_iou_thresh: foreground IoU threshold for Matcher, considered as matched if IoU > fg_iou_thresh
            bg_iou_thresh: background IoU threshold for Matcher, considered as not matched if IoU < bg_iou_thresh
            allow_low_quality_matches: if True, produce additional matches
                for predictions that have only low-quality match candidates.
        z:Require fg_iou_thresh >= bg_iou_thresh. Got fg_iou_thresh=z, bg_iou_thresh=re   )rr   N)rJ   r   Úproposal_matcher)rY   ro   rq   rr   r\   r\   r]   Úset_regular_matcherK  s   ÿÿÿÿz%RetinaNetDetector.set_regular_matcheré   Únum_candidatesÚintÚcenter_in_gtc                 C  s   t || j|| jd| _dS )a&  
        Using for training. Set ATSS matcher that matches anchors with ground truth boxes

        Args:
            num_candidates: number of positions to select candidates from.
                Smaller value will result in a higher matcher threshold and less matched candidates.
            center_in_gt: If False (default), matched anchor center points do not need
                to lie within the ground truth box. Recommend False for small objects.
                If True, will result in a strict matcher and less matched candidates.
        )r-   N)r   r$   r-   rs   )rY   rv   rx   r\   r\   r]   Úset_atss_matcher`  s   z"RetinaNetDetector.set_atss_matcheré
   Úbatch_size_per_imageÚpositive_fractionÚmin_negÚ	pool_sizec                 C  s   t ||||d| _dS )aœ  
        Using for training. Set hard negative sampler that samples part of the anchors for training.

        HardNegativeSampler is used to suppress false positive rate in classification tasks.
        During training, it select negative samples with high prediction scores.

        Args:
            batch_size_per_image: number of elements to be selected per image
            positive_fraction: percentage of positive elements in the selected samples
            min_neg: minimum number of negative samples to select if possible.
            pool_size: when we need ``num_neg`` hard negative samples, they will be randomly selected from
                ``num_neg * pool_size`` negative samples with the highest prediction scores.
                Larger ``pool_size`` gives more randomness, yet selects negative samples that are less 'hard',
                i.e., negative samples with lower prediction scores.
        )r{   r|   r}   r~   N)r   rM   )rY   r{   r|   r}   r~   r\   r\   r]   Úset_hard_negative_samplerm  s   üz+RetinaNetDetector.set_hard_negative_samplerc                 C  s   t ||d| _dS )a  
        Using for training. Set torchvision balanced sampler that samples part of the anchors for training.

        Args:
            batch_size_per_image: number of elements to be selected per image
            positive_fraction: percentage of positive elements per batch

        )r{   r|   N)r   rM   )rY   r{   r|   r\   r\   r]   Úset_balanced_sampler†  s   	ÿz&RetinaNetDetector.set_balanced_samplerr>   g      À?ç        Úroi_sizeÚsw_batch_sizeÚoverlapÚmodeúBlendMode | strÚsigma_scaleúSequence[float] | floatÚpadding_modeúPytorchPadMode | strÚcvalÚ	sw_deviceútorch.device | str | NoneÚdeviceÚprogressÚcache_roi_weight_mapc                 C  s"   t |||||||||	|
|ƒ| _dS )zM
        Define sliding window inferer and store it to self.inferer.
        N)r   rW   )rY   r‚   rƒ   r„   r…   r‡   r‰   r‹   rŒ   rŽ   r   r   r\   r\   r]   Úset_sliding_window_inferer“  s   
õz,RetinaNetDetector.set_sliding_window_infererr<   r=   r?   r@   rA   rB   rC   rD   c                 C  s   t | j|||||d| _dS )aW  
        Using for inference. Set the parameters that are used for box selection during inference.
        The box selection is performed with the following steps:

        #. For each level, discard boxes with scores less than self.score_thresh.
        #. For each level, keep boxes with top self.topk_candidates_per_level scores.
        #. For the whole image, perform non-maximum suppression (NMS) on boxes, with overlapping threshold nms_thresh.
        #. For the whole image, keep boxes with top self.detections_per_img scores.

        Args:
            score_thresh: no box with scores less than score_thresh will be kept
            topk_candidates_per_level: max number of boxes to keep for each level
            nms_thresh: box overlapping threshold for NMS
            detections_per_img: max number of boxes to keep for each image
        )r$   rD   r@   rA   rB   rC   N)r   r$   rX   )rY   r@   rA   rB   rC   rD   r\   r\   r]   Úset_box_selector_parameters²  s   úz-RetinaNetDetector.set_box_selector_parametersÚinput_imagesúlist[Tensor] | TensorÚtargetsúlist[dict[str, Tensor]] | NoneÚuse_infererú+dict[str, Tensor] | list[dict[str, Tensor]]c                 C  sF  | j rt||| j| j| jƒ}|  ¡  t|| j| jƒ\}}| j s!|sQ|  |¡}t	|t
tfƒrLi }|dt|ƒd … || j< |t|ƒd d… || j< |}nt|ƒ n| jdu rZtdƒ‚t|| j| j| jg| jd}|  ||¡ dd„ || j D ƒ}| j| jfD ]}	|  ||	 ¡||	< q~| j r˜|  ||| j|¡}
|
S |  || j||¡}|S )a  
        Returns a dict of losses during training, or a list predicted dict of boxes and labels during inference.

        Args:
            input_images: The input to the model is expected to be a list of tensors, each of shape (C, H, W) or  (C, H, W, D),
                one for each image, and should be in 0-1 range. Different images can have different sizes.
                Or it can also be a Tensor sized (B, C, H, W) or  (B, C, H, W, D). In this case, all images have same size.
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image (optional).
            use_inferer: whether to use self.inferer, a sliding window inferer, to do the inference.
                If False, will simply forward the network.
                If True, will use self.inferer, and requires
                ``self.set_sliding_window_inferer(*args)`` to have been called before.

        Return:
            If training mode, will return a dict with at least two keys,
            including self.cls_key and self.box_reg_key, representing classification loss and box regression loss.

            If evaluation mode, will return a list of detection results.
            Each element corresponds to an images in ``input_images``, is a dict with at least three keys,
            including self.target_box_key, self.target_label_key, self.pred_score_key,
            representing predicted boxes, classification labels, and classification scores.

        Nrd   zZ`self.inferer` is not defined.Please refer to function self.set_sliding_window_inferer(*).)ÚkeysrW   c                 S  s   g | ]}|j d d…  ¡ ‘qS )rd   N)ÚshapeÚnumel)Ú.0Úxr\   r\   r]   Ú
<listcomp>  s    z-RetinaNetDetector.forward.<locals>.<listcomp>)Útrainingr   r%   rU   rT   Ú#_check_detector_training_componentsr   r(   r!   Ú
isinstanceÚtupleÚlistrf   r*   r,   r   rW   rJ   r   Úgenerate_anchorsÚ_reshape_mapsÚcompute_lossrK   Úpostprocess_detections)rY   r“   r•   r—   ÚimagesÚimage_sizesÚhead_outputsZtmp_dictÚnum_anchor_locs_per_levelÚkeyÚlossesÚ
detectionsr\   r\   r]   ÚforwardÓ  s@   ÿ



ÿÿ
ÿzRetinaNetDetector.forwardc                 C  s8   t | dƒs	tdƒ‚| jdu r| jrt d¡ dS dS dS )zc
        Check if self.proposal_matcher and self.fg_bg_sampler have been set for training.
        rs   z\Matcher is not set. Please refer to self.set_regular_matcher(*) or self.set_atss_matcher(*).Na  No balanced sampler is used. Negative samples are likely to be much more than positive samples. Please set balanced samplers with self.set_balanced_sampler(*) or self.set_hard_negative_sampler(*), or set classification loss function as Focal loss with self.set_cls_loss(*))r^   ÚAttributeErrorrM   r-   ÚwarningsÚwarn©rY   r\   r\   r]   r    (  s   
ÿÿÿz5RetinaNetDetector._check_detector_training_componentsr¨   r   rª   údict[str, list[Tensor]]c                 C  s:   | j du s| j|jkr|  ||| j ¡| _ |j| _dS dS )aA  
        Generate anchors and store it in self.anchors: List[Tensor].
        We generate anchors only when there is no stored anchors,
        or the new coming images has different shape with self.previous_image_shape

        Args:
            images: input images, a (B, C, H, W) or (B, C, H, W, D) Tensor.
            head_outputs: head_outputs. ``head_output_reshape[self.cls_key]`` is a Tensor
              sized (B, sum(HW(D)A), self.num_classes). ``head_output_reshape[self.box_reg_key]`` is a Tensor
              sized (B, sum(HW(D)A), 2*self.spatial_dims)
        N)rK   rL   rš   r#   r*   )rY   r¨   rª   r\   r\   r]   r¤   8  s   þz"RetinaNetDetector.generate_anchorsÚresult_mapsúlist[Tensor]c           	   	   C  sô   g }|D ]n}|j d }|j d | j }|j | j d… }|d|f| }| |¡}| jdkr7| ddddd¡}n| jdkrG| dddddd¡}ntd	ƒ‚| |d|¡}t |¡ 	¡ s`t 
|¡ 	¡ rmt ¡ rhtd
ƒ‚t d
¡ | |¡ qtj|ddS )a¶  
        Concat network output map list to a single Tensor.
        This function is used in both training and inference.

        Args:
            result_maps: a list of Tensor, each Tensor is a (B, num_channel*A, H, W) or (B, num_channel*A, H, W, D) map.
                A = self.num_anchors_per_loc

        Return:
            reshaped and concatenated result, sized (B, sum(HWA), num_channel) or (B, sum(HWDA), num_channel)
        r   r   Néÿÿÿÿrd   é   ru   é   zImages can only be 2D or 3D.z"Concatenated result is NaN or Inf.©Údim)rš   rI   r%   ÚviewÚpermuterJ   ÚreshaperO   ÚisnanÚanyÚisinfÚis_grad_enabledr±   r²   ÚappendÚcat)	rY   rµ   Zall_reshaped_result_mapÚ
result_mapÚ
batch_sizeZnum_channelÚspatial_sizeÚ
view_shapeZreshaped_result_mapr\   r\   r]   r¥   H  s&   




zRetinaNetDetector._reshape_mapsÚhead_outputs_reshapeúdict[str, Tensor]rK   r©   úlist[list[int]]r«   úSequence[int]Úneed_sigmoidúlist[dict[str, Tensor]]c              	     s
  ‡fdd„|D ƒ‰i }|D ]}t || jˆddƒ||< q‡fdd„|D ƒ}|ˆj }	|ˆj }
|	d j‰ t|ƒ}g }t|ƒD ]C‰‡fdd„|
D ƒ}‡fdd„|	D ƒ}|ˆ |ˆ }}‡ ‡fd	d„t||ƒD ƒ}ˆj 	|||¡\}}}| 
ˆj|ˆj|ˆj|i¡ q?|S )
a¹  
        Postprocessing to generate detection result from classification logits and box regression.
        Use self.box_selector to select the final output boxes for each image.

        Args:
            head_outputs_reshape: reshaped head_outputs. ``head_output_reshape[self.cls_key]`` is a Tensor
              sized (B, sum(HW(D)A), self.num_classes). ``head_output_reshape[self.box_reg_key]`` is a Tensor
              sized (B, sum(HW(D)A), 2*self.spatial_dims)
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            anchors: a list of Tensor. Each Tensor represents anchors for each image,
                sized (sum(HWA), 2*spatial_dims) or (sum(HWDA), 2*spatial_dims).
                A = self.num_anchors_per_loc.

        Return:
            a list of dict, each dict corresponds to detection result on image.
        c                   s   g | ]}|ˆ j  ‘qS r\   )rI   )rœ   Znum_anchor_locsr³   r\   r]   rž     s    
ÿz<RetinaNetDetector.postprocess_detections.<locals>.<listcomp>r   rº   c                   s   g | ]	}t | ˆ ¡ƒ‘qS r\   )r£   Úsplit)rœ   Úa)Únum_anchors_per_levelr\   r]   rž   ˜  s    r   c                   ó   g | ]}|ˆ  ‘qS r\   r\   )rœ   Úbr©Úindexr\   r]   rž   £  s    ÿc                   rÒ   r\   r\   )rœ   ÚclrÔ   r\   r]   rž   ¦  s    c                   s,   g | ]\}}ˆj  | tj¡|¡ ˆ ¡‘qS r\   )rS   Údecode_singleÚtorO   Úfloat32)rœ   ÚbrÐ   )Úcompute_dtyperY   r\   r]   rž   ©  s    ÿÿ)r£   rÏ   r*   r,   Údtyperf   ÚrangeÚziprX   Zselect_boxes_per_imagerÃ   rT   rV   rU   )rY   rÉ   rK   r©   r«   rÍ   Úsplit_head_outputsÚkÚsplit_anchorsÚclass_logitsr    Ú
num_imagesr®   Úbox_regression_per_imageÚlogits_per_imageÚanchors_per_imageZimg_spatial_sizeÚboxes_per_imageZselected_boxesZselected_scoresZselected_labelsr\   )rÛ   rÕ   rÑ   rY   r]   r§   u  s>   
ÿ



ÿþ
ÿýÿz(RetinaNetDetector.postprocess_detectionsc                 C  sH   |   |||¡}|  || j ||¡}|  || j |||¡}| j|| j|iS )aþ  
        Compute losses.

        Args:
            head_outputs_reshape: reshaped head_outputs. ``head_output_reshape[self.cls_key]`` is a Tensor
              sized (B, sum(HW(D)A), self.num_classes). ``head_output_reshape[self.box_reg_key]`` is a Tensor
              sized (B, sum(HW(D)A), 2*self.spatial_dims)
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            anchors: a list of Tensor. Each Tensor represents anchors for each image,
                sized (sum(HWA), 2*spatial_dims) or (sum(HWDA), 2*spatial_dims).
                A = self.num_anchors_per_loc.

        Return:
            a dict of several kinds of losses.
        )Úcompute_anchor_matched_idxsÚcompute_cls_lossr*   Úcompute_box_lossr,   )rY   rÉ   r•   rK   r«   Úmatched_idxsZ
losses_clsZlosses_box_regressionr\   r\   r]   r¦   ¼  s   ÿzRetinaNetDetector.compute_lossc           	   	   C  s  g }t ||ƒD ]€\}}|| j  ¡ dkr'| tj| d¡fdtj|jd¡ qt	| j
tƒr@|  || j  |j¡|¡}|  
|¡}nt	| j
tƒrY|  
|| j  |j¡||| j¡\}}ntdƒ‚| jrotdtj|ddd › dƒ t |¡dk r‚t d	|| j › d¡ | |¡ q|S )
a  
        Compute the matched indices between anchors and ground truth (gt) boxes in targets.
        output[k][i] represents the matched gt index for anchor[i] in image k.
        Suppose there are M gt boxes for image k. The range of it output[k][i] value is [-2, -1, 0, ..., M-1].
        [0, M - 1] indicates this anchor is matched with a gt box,
        while a negative value indicating that it is not matched.

        Args:
            anchors: a list of Tensor. Each Tensor represents anchors for each image,
                sized (sum(HWA), 2*spatial_dims) or (sum(HWDA), 2*spatial_dims).
                A = self.num_anchors_per_loc.
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            num_anchor_locs_per_level: each element represents HW or HWD at this level.


        Return:
            a list of matched index `matched_idxs_per_image` (Tensor[int64]), Tensor sized (sum(HWA),) or (sum(HWDA),).
            Suppose there are M gt boxes. `matched_idxs_per_image[i]` is a matched gt index in [0, M - 1]
            or a negative value indicating that anchor i could not be matched.
            BELOW_LOW_THRESHOLD = -1, BETWEEN_THRESHOLDS = -2
        r   r·   )rÜ   rŽ   z­Currently support torchvision Matcher and monai ATSS matcher. Other types of matcher not supported. Please override self.compute_anchor_matched_idxs(*) for your own matcher.z.Max box overlap between anchors and gt boxes: r   rº   re   z´No anchor is matched with GT boxes. Please adjust matcher setting, anchor setting, or the network setting to change zoom scale between network output and input images.GT boxes are )rÞ   rT   r›   rÃ   rO   ÚfullÚsizeÚint64rŽ   r¡   rs   r   r$   rØ   r   rI   ÚNotImplementedErrorr-   ÚprintÚmaxr±   r²   )	rY   rK   r•   r«   rë   ræ   Útargets_per_imageÚmatch_quality_matrixÚmatched_idxs_per_imager\   r\   r]   rè   Ú  s@   ÿÿ
üÿþÿz-RetinaNetDetector.compute_anchor_matched_idxsÚ
cls_logitsrë   c                 C  sz   g }g }t |||ƒD ]\}}}|  |||¡\}	}
| |	¡ | |
¡ q
tj|dd}tj|dd}|  ||¡ |j¡}|S )a×  
        Compute classification losses.

        Args:
            cls_logits: classification logits, sized (B, sum(HW(D)A), self.num_classes)
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            matched_idxs: a list of matched index. each element is sized (sum(HWA),) or  (sum(HWDA),)

        Return:
            classification losses.
        r   rº   )rÞ   Úget_cls_train_sample_per_imagerÃ   rO   rÄ   rl   rØ   rÜ   )rY   rõ   r•   rë   Ztotal_cls_logits_listZtotal_gt_classes_target_listrò   Úcls_logits_per_imagerô   Zsampled_cls_logits_per_imageZsampled_gt_classes_targetZtotal_cls_logitsZtotal_gt_classes_targetr­   r\   r\   r]   ré   "  s   ÿ
z"RetinaNetDetector.compute_cls_lossc                 C  sœ   g }g }t ||||ƒD ]\}}}	}
|  |||	|
¡\}}| |¡ | |¡ qtj|dd}tj|dd}|jd dkrBt d¡}|S |  ||¡ |j	¡}|S )a±  
        Compute box regression losses.

        Args:
            box_regression: box regression results, sized (B, sum(HWA), 2*self.spatial_dims)
            targets: a list of dict. Each dict with two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            anchors: a list of Tensor. Each Tensor represents anchors for each image,
                sized (sum(HWA), 2*spatial_dims) or (sum(HWDA), 2*spatial_dims).
                A = self.num_anchors_per_loc.
            matched_idxs: a list of matched index. each element is sized (sum(HWA),) or  (sum(HWDA),)

        Return:
            box regression losses.
        r   rº   r   )
rÞ   Úget_box_train_sample_per_imagerÃ   rO   rÄ   rš   Útensorrn   rØ   rÜ   )rY   r    r•   rK   rë   Ztotal_box_regression_listZtotal_target_regression_listrò   rä   ræ   rô   Zdecode_box_regression_per_imageÚmatched_gt_boxes_per_imageZtotal_box_regressionZtotal_target_regressionr­   r\   r\   r]   rê   @  s"   ÿÿ

z"RetinaNetDetector.compute_box_lossr÷   rò   rô   útuple[Tensor, Tensor]c                 C  s¦  t  |¡ ¡ st  |¡ ¡ rt  ¡ rtdƒ‚t d¡ |dk}t| 	¡ ƒ}|| j
 jd }| jrPtd|› d|› dƒ |dkrP|d| k rPtd|› d|› d	ƒ t  |¡}d
|||| j ||  f< | jdu rn|| jjk}nUt| jtƒrt j| t j¡ddd }	|  |d g|	¡\}
}nt| jtƒrž|  |d g¡\}
}ntdƒ‚t  t j|
dd¡d }t  t j|dd¡d }t j||gdd}||dd…f ||dd…f fS )a;  
        Get samples from one image for classification losses computation.

        Args:
            cls_logits_per_image: classification logits for one image, (sum(HWA), self.num_classes)
            targets_per_image: a dict with at least two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            matched_idxs_per_image: matched index, Tensor sized (sum(HWA),) or (sum(HWDA),)
                Suppose there are M gt boxes. matched_idxs_per_image[i] is a matched gt index in [0, M - 1]
                or a negative value indicating that anchor i could not be matched.
                BELOW_LOW_THRESHOLD = -1, BETWEEN_THRESHOLDS = -2

        Return:
            paired predicted and GT samples from one image for classification losses computation
        z.NaN or Inf in predicted classification logits.r   z&Number of positive (matched) anchors: z; Number of GT box: re   rd   zOnly z anchors are matched with zš GT boxes. Please consider adjusting matcher setting, anchor setting, or the network setting to change zoom scale between network output and input images.r6   Nr   rº   z×Currently support torchvision BalancedPositiveNegativeSampler and monai HardNegativeSampler matcher. Other types of sampler not supported. Please override self.get_cls_train_sample_per_image(*) for your own sampler.)rO   r¿   rÀ   rÁ   rÂ   rJ   r±   r²   rw   ÚsumrT   rš   r-   rð   Ú
zeros_likerU   rM   rs   ÚBETWEEN_THRESHOLDSr¡   r   rñ   rØ   rÙ   r   rï   ÚwhererÄ   )rY   r÷   rò   rô   Úforeground_idxs_per_imageÚnum_foregroundÚ
num_gt_boxÚgt_classes_targetÚvalid_idxs_per_imageZmax_cls_logits_per_imageZsampled_pos_inds_listZsampled_neg_inds_listÚsampled_pos_indsÚsampled_neg_indsr\   r\   r]   rö   o  sL   
ÿ
ûÿÿÿ
	

ÿÿ z0RetinaNetDetector.get_cls_train_sample_per_imagerä   ræ   c           
      C  sú   t  |¡ ¡ st  |¡ ¡ rt  ¡ rtdƒ‚t d¡ t  |dk¡d }|| j	 j
d }|dkrD|dd…dd…f |dd…dd…f fS || j	 ||   |j¡}||dd…f }||dd…f }|}|}	| jro| j ||¡}| jry| j |	|¡}	|	|fS )aï  
        Get samples from one image for box regression losses computation.

        Args:
            box_regression_per_image: box regression result for one image, (sum(HWA), 2*self.spatial_dims)
            targets_per_image: a dict with at least two keys: self.target_box_key and self.target_label_key,
                ground-truth boxes present in the image.
            anchors_per_image: anchors of one image,
                sized (sum(HWA), 2*spatial_dims) or (sum(HWDA), 2*spatial_dims).
                A = self.num_anchors_per_loc.
            matched_idxs_per_image: matched index, sized (sum(HWA),) or  (sum(HWDA),)

        Return:
            paired predicted and GT samples from one image for box regression losses computation
        z'NaN or Inf in predicted box regression.r   N)rO   r¿   rÀ   rÁ   rÂ   rJ   r±   r²   rÿ   rT   rš   rØ   rŽ   r4   rS   Úencode_singler5   r×   )
rY   rä   rò   ræ   rô   r   r  rú   Zmatched_gt_boxes_per_image_Zbox_regression_per_image_r\   r\   r]   rø   ¾  s,   
(ÿþz0RetinaNetDetector.get_box_train_sample_per_image)r!   r"   r#   r
   r$   r   r%   r&   r'   r&   r(   r)   r*   r+   r,   r+   r-   r.   )N)r8   ra   rb   rc   )rh   r+   ri   r+   rb   rc   )rk   r"   rb   rc   )rm   r"   r4   r.   r5   r.   rb   rc   )T)ro   rp   rq   rp   rr   r.   rb   rc   )ru   F)rv   rw   rx   r.   rb   rc   )r   rz   )
r{   rw   r|   rp   r}   rw   r~   rp   rb   rc   )r{   rw   r|   rp   rb   rc   )r‚   r)   rƒ   rw   r„   rp   r…   r†   r‡   rˆ   r‰   rŠ   r‹   rp   rŒ   r   rŽ   r   r   r.   r   r.   rb   rc   )r<   r=   r>   r?   T)r@   rp   rA   rw   rB   rp   rC   rw   rD   r.   rb   rc   )NF)r“   r”   r•   r–   r—   r.   rb   r˜   )r¨   r   rª   r´   rb   rc   )rµ   r¶   rb   r   )rÉ   rÊ   rK   r¶   r©   rË   r«   rÌ   rÍ   r.   rb   rÎ   )
rÉ   rÊ   r•   rÎ   rK   r¶   r«   rÌ   rb   rÊ   )rK   r¶   r•   rÎ   r«   rÌ   rb   r¶   )rõ   r   r•   rÎ   rë   r¶   rb   r   )
r    r   r•   rÎ   rK   r¶   rë   r¶   rb   r   )r÷   r   rò   rÊ   rô   r   rb   rû   )
rä   r   rò   rÊ   ræ   r   rô   r   rb   rû   ) Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   rF   rG   rg   rj   rN   rQ   rt   ry   r   r€   r   ÚCONSTANTr   r‘   r’   r¯   r    r¤   r¥   r§   r¦   rè   ré   rê   rö   rø   Ú__classcell__r\   r\   rZ   r]   r   D   sh    yö
H



ÿÿ
ô!ú$üU

3ú
G

H

/Or   ©r   rd   r¸   FTr'   rw   r#   r
   Úreturned_layersrÌ   Ú
pretrainedr.   r   Úkwargsr   rb   c                   sr   t j||fi |¤Ž}t|jjƒ}t|||dˆ d}| ¡ d }	‡ fdd„|jjjD ƒ}
t|| |	||
d}t	||ƒS )aX  
    Returns a RetinaNet detector using a ResNet-50 as backbone, which can be pretrained
    from `Med3D: Transfer Learning for 3D Medical Image Analysis <https://arxiv.org/pdf/1904.00625.pdf>`
    _.

    Args:
        num_classes: number of output classes of the model (excluding the background).
        anchor_generator: AnchorGenerator,
        returned_layers: returned layers to extract feature maps. Each returned layer should be in the range [1,4].
            len(returned_layers)+1 will be the number of extracted feature maps.
            There is an extra maxpooling layer LastLevelMaxPool() appended.
        pretrained: If True, returns a backbone pre-trained on 23 medical datasets
        progress: If True, displays a progress bar of the download to stderr

    Return:
        A RetinaNetDetector object with resnet50 as backbone

    Example:

        .. code-block:: python

            # define a naive network
            resnet_param = {
                "pretrained": False,
                "spatial_dims": 3,
                "n_input_channels": 2,
                "num_classes": 3,
                "conv1_t_size": 7,
                "conv1_t_stride": (2, 2, 2)
            }
            returned_layers = [1]
            anchor_generator = monai.apps.detection.utils.anchor_utils.AnchorGeneratorWithAnchorShape(
                feature_map_scales=(1, 2), base_anchor_shapes=((8,) * resnet_param["spatial_dims"])
            )
            detector = retinanet_resnet50_fpn_detector(
                **resnet_param, anchor_generator=anchor_generator, returned_layers=returned_layers
            )
    N)Úbackboner%   Úpretrained_backboneÚtrainable_backbone_layersr  r   c                   s    g | ]}|d  d t ˆ ƒ  ‘qS )rd   )rñ   )rœ   Ús©r  r\   r]   rž   1  s     z3retinanet_resnet50_fpn_detector.<locals>.<listcomp>)r%   r'   r0   Úfeature_extractorr(   )
r   Úresnet50rf   Úconv1Ústrider	   rH   Úbodyr   r   )r'   r#   r  r  r   r  r  r%   r  r0   r(   r!   r\   r  r]   Úretinanet_resnet50_fpn_detector÷  s&   /ûû
r  )r  FT)r'   rw   r#   r
   r  rÌ   r  r.   r   r.   r  r   rb   r   )0r  Ú
__future__r   r±   Úcollections.abcr   r   Útypingr   rO   r   r   Z/monai.apps.detection.networks.retinanet_networkr   r	   Z'monai.apps.detection.utils.anchor_utilsr
   Z'monai.apps.detection.utils.ATSS_matcherr   Z$monai.apps.detection.utils.box_coderr   Z'monai.apps.detection.utils.box_selectorr   Z)monai.apps.detection.utils.detector_utilsr   r   Z0monai.apps.detection.utils.hard_negative_samplerr   Z(monai.apps.detection.utils.predict_utilsr   r   Úmonai.data.box_utilsr   Úmonai.inferersr   Úmonai.networks.netsr   Úmonai.utilsr   r   r   r   r   Ú_r   ÚModuler   r  r\   r\   r\   r]   Ú<module>   sF   "
ÿ       =û