o
    (iu                     @  sV   d dl mZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 G dd dejZdS )    )annotations)OptionalN)CrossAttentionBlockMLPBlockSABlockc                      sD   e Zd ZdZ									d!d" fddZ	d#d$dd Z  ZS )%TransformerBlockz
    A transformer block, based on: "Dosovitskiy et al.,
    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
            FNThidden_sizeintmlp_dim	num_headsdropout_ratefloatqkv_biasbool	save_attncausalsequence_length
int | Nonewith_cross_attentionuse_flash_attention
include_fcuse_combined_linearreturnNonec                   s   t    d|  krdkstd td|| dkr!tdt|||| _t|| _t||||||||||
d
| _	t|| _
|	| _t|| _t||||d|
d| _dS )	a  
        Args:
            hidden_size (int): dimension of hidden layer.
            mlp_dim (int): dimension of feedforward layer.
            num_heads (int): number of attention heads.
            dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0.
            qkv_bias(bool, optional): apply bias term for the qkv linear layer. Defaults to False.
            save_attn (bool, optional): to make accessible the attention matrix. Defaults to False.
            use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
                (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
            include_fc: whether to include the final linear layer. Default to True.
            use_combined_linear: whether to use a single linear layer for qkv projection, default to True.

        r      z'dropout_rate should be between 0 and 1.z-hidden_size should be divisible by num_heads.)r   r   r   r   r   r   r   F)r	   r   r   r   r   r   N)super__init__
ValueErrorr   mlpnn	LayerNormnorm1r   attnnorm2r   norm_cross_attnr   
cross_attn)selfr	   r   r   r   r   r   r   r   r   r   r   r   	__class__ h/home/dell461/cl/sdc2/last_ska_mid/HISourceFinder-master-l/src/monai/networks/blocks/transformerblock.pyr      s@   
zTransformerBlock.__init__xtorch.TensorcontextOptional[torch.Tensor]	attn_maskc                 C  sN   || j | ||d }| jr|| j| ||d }|| | | }|S )N)r0   )r.   )r#   r"   r   r&   r%   r   r$   )r'   r,   r.   r0   r*   r*   r+   forward]   s
   zTransformerBlock.forward)	r   FFFNFFTT)r	   r
   r   r
   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )NN)r,   r-   r.   r/   r0   r/   r   r-   )__name__
__module____qualname____doc__r   r1   __classcell__r*   r*   r(   r+   r      s    
Br   )
__future__r   typingr   torchtorch.nnr    Zmonai.networks.blocksr   r   r   Moduler   r*   r*   r*   r+   <module>   s   