
    Ǆg-                     V   d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlZd dlmc mZ d dlmZmZ ddlmZ ddlmZ dd	lmZ 	 d d
lmZ dZe G d d             Z G d dej@                        Z  G d dejB                        Z! G d dejD                        Z"d!dZ#ed        Z$ G d dejJ                        Z& G d dejJ                        Z' G d dejJ                        Z( G d dejJ                        Z) G d d ejJ                        Z*y# eeef$ r dZdZY w xY w)"    N)contextmanager)	dataclass)DictIterableOptionalTuple)Tensornn   )decode)detect_language)
transcribe)scaled_dot_product_attentionTFc                   r    e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   y)ModelDimensionsn_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerN)__name__
__module____qualname__int__annotations__     U/home/mcse/projects/flask_80/flask-venv/lib/python3.12/site-packages/whisper/model.pyr   r      s;    KLOr"   r   c                   (     e Zd Zdedef fdZ xZS )	LayerNormxreturnc                 p    t         |   |j                               j                  |j                        S N)superforwardfloattypedtype)selfr&   	__class__s     r#   r+   zLayerNorm.forward(   s'    wqwwy)..qww77r"   )r   r   r   r	   r+   __classcell__r0   s   @r#   r%   r%   '   s    8 8F 8 8r"   r%   c                       e Zd ZdedefdZy)Linearr&   r'   c                     t        j                  || j                  j                  |j                        | j
                  d       S | j
                  j                  |j                              S r)   )Flinearweighttor.   bias)r/   r&   s     r#   r+   zLinear.forward-   sV    xxKKNN177#II%D
 	
 ,099<<+@
 	
r"   N)r   r   r   r	   r+   r!   r"   r#   r4   r4   ,   s    
 
F 
r"   r4   c                   6     e Zd Zdededee   def fdZ xZS )Conv1dr&   r8   r:   r'   c                     t         |   ||j                  |j                        |d       S |j                  |j                              S r)   )r*   _conv_forwardr9   r.   )r/   r&   r8   r:   r0   s       r#   r>   zConv1d._conv_forward6   sI     w$vyy!4<4
 	
=AWWQWW=M
 	
r"   )r   r   r   r	   r   r>   r1   r2   s   @r#   r<   r<   5   s2    

!'
/7/?
	
 
r"   r<   c                    |dz  dk(  sJ t        j                  |      |dz  dz
  z  }t        j                  | t        j                  |dz        z        }t        j                  |       ddt         j
                  f   |t         j
                  ddf   z  }t        j                  t        j                  |      t        j                  |      gd      S )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_times         r#   	sinusoidsrR   >   s    a<1 ff]3x1}q7HIYY 77%,,xST}:UUVN,,v&q"**}5rzzST}8UUK99eii,eii.DE1MMr"   c               #      K   t         j                  } 	 dt         _        d  | t         _        y # | t         _        w xY ww)NF)MultiHeadAttentionuse_sdpa)
prev_states    r#   disable_sdparW   G   s4     #,,J1&+#&0#j#s   ?/ ?<?c                        e Zd ZdZdedef fdZ	 	 	 ddedee   dee   dee   fd	Z		 dd
edededee   de
ej                  eej                     f   f
dZ xZS )rT   Tn_staten_headc                     t         |           || _        t        ||      | _        t        ||d      | _        t        ||      | _        t        ||      | _        y )NF)r:   )r*   __init__rZ   r4   querykeyvalueout)r/   rY   rZ   r0   s      r#   r\   zMultiHeadAttention.__init__T   sO    GW-
'77GW-
'7+r"   r&   xamaskkv_cachec                 .   | j                  |      }||| j                  |vr+| j                  ||n|      }| j                  ||n|      }n|| j                     }|| j                     }| j                  ||||      \  }}	| j	                  |      |	fS r)   )r]   r^   r_   qkv_attentionr`   )
r/   r&   ra   rb   rc   qkvwvqks
             r#   r+   zMultiHeadAttention.forward\   s     JJqMrzTXXX-E bjb1A


13A "A$A##Aq!T2Bxx|Rr"   rf   rg   rh   r'   c                    |j                   \  }}}|| j                  z  dz  } |j                  g |j                   d d | j                  d j                  dddd      } |j                  g |j                   d d | j                  d j                  dddd      } |j                  g |j                   d d | j                  d j                  dddd      }t        rRt
        j                  rBt        ||||d uxr |dkD        }	|	j                  dddd      j                  d      }
d }|
|fS ||z  ||z  j                  dd	      z  }|||d |d |f   z   }|j                         }t        j                  |d
      j                  |j                        }||z  j                  dddd      j                  d      }
|j                         }|
|fS )Ng      пr@   r   r      )	is_causal)	start_dimrA   )shaperZ   viewpermuteSDPA_AVAILABLErT   rU   r   flatten	transposer,   r6   softmaxr9   r.   detach)r/   rf   rg   rh   rb   n_batchn_ctxrY   scalear`   rj   ws                r#   re   z MultiHeadAttention.qkv_attentionr   s    #$''DKK'E1AFF1AGGBQK11b199!Q1EAFF1AGGBQK11b199!Q1EAFF1AGGBQK11b199!Q1E099,1a4t#3#A	A ))Aq!Q'//!/<CB Bw e)E	44R<<B$vvvv~..B		""%((1Aq5//!Q1-555BCBBwr"   NNNr)   )r   r   r   rU   r   r\   r	   r   dictr+   r   rE   re   r1   r2   s   @r#   rT   rT   Q   s    H, ,S ,  $!%#'   V  v	 
 4. . IM"'-5=f5E	u||Xell33	4r"   rT   c            
       `     e Zd Zd
dededef fdZ	 	 	 ddedee   dee   dee   fd	Z	 xZ
S )ResidualAttentionBlockrY   rZ   cross_attentionc                 d   t         |           t        ||      | _        t	        |      | _        |rt        ||      nd | _        |rt	        |      nd | _        |dz  }t        j                  t        ||      t        j                         t        ||            | _        t	        |      | _        y )N   )r*   r\   rT   attnr%   attn_ln
cross_attncross_attn_lnr
   
Sequentialr4   GELUmlpmlp_ln)r/   rY   rZ   r   n_mlpr0   s        r#   r\   zResidualAttentionBlock.__init__   s    &w7	 ) 4Cw/ 	 4CYw/!==7E"BGGIveW/E
  (r"   r&   ra   rb   rc   c                    || j                  | j                  |      ||      d   z   }| j                  r)|| j                  | j                  |      ||      d   z   }|| j	                  | j                  |            z   }|S )Nrb   rc   r   )rc   )r   r   r   r   r   r   )r/   r&   ra   rb   rc   s        r#   r+   zResidualAttentionBlock.forward   s{     		$,,q/x	HKK??DOOD$6$6q$92OQRSTTAQ((r"   )Fr~   )r   r   r   r   boolr\   r	   r   r   r+   r1   r2   s   @r#   r   r      sa    ) )S )4 )(  $!%#' V v	
 4.r"   r   c            
       @     e Zd Zdededededef
 fdZdefdZ xZS )	AudioEncoderr   rz   rY   rZ   n_layerc           	      L   t         |           t        ||dd      | _        t        ||ddd      | _        | j                  dt        ||             t        j                  t        |      D cg c]  }t        ||       c}      | _        t        |      | _        y c c}w )Nrm   r   )kernel_sizepaddingr@   )r   strider   positional_embedding)r*   r\   r<   conv1conv2register_bufferrR   r
   
ModuleListranger   blocksr%   ln_post)r/   r   rz   rY   rZ   r   _r0   s          r#   r\   zAudioEncoder.__init__   s     	FGAF
GW!AqQ
3Yug5NO8:>CGnM#GV4M9
 !) Ns   1B!r&   c                    t        j                  | j                  |            }t        j                  | j                  |            }|j	                  ddd      }|j
                  dd | j                  j
                  k(  sJ d       || j                  z   j                  |j                        }| j                  D ]
  } ||      } | j                  |      }|S )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   r@   r   Nzincorrect audio shape)r6   gelur   r   rs   rq   r   r9   r.   r   r   )r/   r&   blocks      r#   r+   zAudioEncoder.forward   s    
 FF4::a=!FF4::a=!IIaAwwqr{d77===V?VV=***..qww7[[ 	EaA	 LLOr"   )r   r   r   r   r\   r	   r+   r1   r2   s   @r#   r   r      s<    **"%*03*=@*KN* r"   r   c            
       P     e Zd Zdededededef
 fdZddeded	ee   fd
Z xZ	S )TextDecoderr   rz   rY   rZ   r   c           
         t         |           t        j                  ||      | _        t        j
                  t        j                  ||            | _        t        j                  t        |      D cg c]  }t        ||d       c}      | _        t        |      | _        t        j                  ||      j                  t         j"                         j%                  d      }| j'                  d|d       y c c}w )NT)r   r   rb   F
persistent)r*   r\   r
   	Embeddingtoken_embedding	ParameterrE   emptyr   r   r   r   r   r%   lnfill_rC   inftriu_r   )	r/   r   rz   rY   rZ   r   r   rb   r0   s	           r#   r\   zTextDecoder.__init__   s     	!||GW=$&LLUG1L$M!8: w 'wM9
 G${{5%(..w7==a@VTe<s   5C>r&   ra   rc   c                    |r/t        t        |j                                     j                  d   nd}| j	                  |      | j
                  |||j                  d   z    z   }|j                  |j                        }| j                  D ]  } |||| j                  |      } | j                  |      }|t        j                  | j                  j                  j                  |j                        dd      z  j                         }|S )z
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        r   r   rl   r   )nextitervaluesrq   r   r   r9   r.   r   rb   r   rE   rv   r8   r,   )r/   r&   ra   rc   offsetr   logitss          r#   r+   zTextDecoder.forward   s     <Dd8??,-.44Q7  #''!''"+1EFG 	
 DDN[[ 	@Ea$))h?A	@ GGAJ 4 4 ; ; > >qww GANN
%' 	 r"   r)   )
r   r   r   r   r\   r	   r   r   r+   r1   r2   s   @r#   r   r      sN    ==#&=14=>A=LO=& V x~ r"   r   c                   2    e Zd Zdef fdZdefdZdej                  fdZ	dej                  dej                  fd	Z
dej                  dej                  d
eeej                  f   fdZed        Zed        Zed        Zddee   fdZeZeZeZ xZS )Whisperdimsc                 (   t         |           || _        t        | j                  j                  | j                  j
                  | j                  j                  | j                  j                  | j                  j                        | _	        t        | j                  j                  | j                  j                  | j                  j                  | j                  j                  | j                  j                        | _        t#        j$                  | j                  j                  | j                  j                  t"        j&                        }d|| j                  j                  dz  d  | j)                  d|j+                         d       y )Nr.   Tr@   alignment_headsFr   )r*   r\   r   r   r   r   r   r   r   encoderr   r   r   r   r   r   decoderrE   zerosr   r   	to_sparse)r/   r   	all_headsr0   s      r#   r\   zWhisper.__init__   s   	#IIII!!II##II""II##
 #IIII  II""II!!II""
 KKII""DII$9$9
	 48	$))((A-/0.	0C0C0ERWXr"   dumpc                    t        j                  t        j                  t	        j
                  |            t              j                         }t        j                  |      j                  | j                  j                  | j                  j                        }| j                  d|j                         d       y )Nr   r   Fr   )rC   
frombuffergzip
decompressbase64	b85decoder   copyrE   
from_numpyreshaper   r   r   r   r   )r/   r   arrayrb   s       r#   set_alignment_headszWhisper.set_alignment_heads  s    OOF,,T234

$& 	 &..II""DII$9$9
 	.0@USr"   melc                 $    | j                  |      S r)   )r   )r/   r   s     r#   embed_audiozWhisper.embed_audio  s    ||C  r"   tokensaudio_featuresc                 &    | j                  ||      S r)   )r   )r/   r   r   s      r#   r   zWhisper.logits"  s    ||FN33r"   r'   c                 D    | j                  || j                  |            S r)   )r   r   )r/   r   r   s      r#   r+   zWhisper.forward%  s     ||FDLL$566r"   c                 H    t        | j                               j                  S r)   )r   
parametersdevicer/   s    r#   r   zWhisper.device*  s    DOO%&---r"   c                 4    | j                   j                  dk\  S )Ni  )r   r   r   s    r#   is_multilingualzWhisper.is_multilingual.  s    yy  E))r"   c                 `    | j                   j                  dz
  t        | j                        z
  S )Ni5  )r   r   r   r   r   s    r#   num_languageszWhisper.num_languages2  s'    yy  5(3t/C/C+DDDr"   cachec                      i ni g  fddt         j                  ffd} j                  j                  |       fS )a  
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        c                     | vs&|j                   d   j                  j                  kD  r
|| <   |    S t        j                  |    |gd      j                         | <   |    S )Nr   rA   )rq   r   r   rE   rI   rx   )moduler   outputr   r/   s      r#   save_to_cachez5Whisper.install_kv_cache_hooks.<locals>.save_to_cacheG  sk    U"fll1o		8L8L&L &f =  !&		5=&*Aq I P P Rf= r"   layerc                     t        | t              rUj                  | j                  j	                               j                  | j
                  j	                               y y r)   )
isinstancerT   appendr^   register_forward_hookr_   )r   hooksr   s    r#   install_hooksz5Whisper.install_kv_cache_hooks.<locals>.install_hooksO  sI    %!34UYY<<]KLU[[>>}MN 5r"   )r
   Moduler   apply)r/   r   r   r   r   s   `` @@r#   install_kv_cache_hookszWhisper.install_kv_cache_hooks6  sO     #.	5	B	!	O 	O
 	=)e|r"   r)   )r   r   r   r   r\   bytesr   rE   r	   r   r   r   strr+   propertyr   r   r   r   r   r   detect_language_functionr   transcribe_functionr   decode_functionr   r1   r2   s   @r#   r   r      s    Y_ Y2T T!u|| !4U\\ 45<< 47<<7).7	c5<<	 7
 . . * * E EHTN B /O$JFr"   r   )i'  )+r   r   
contextlibr   dataclassesr   typingr   r   r   r   numpyrC   rE   torch.nn.functionalr
   
functionalr6   r	   decodingr   r   r   r   r   r   r   rt   ImportErrorRuntimeErrorOSErrorr   r%   r4   r<   rR   rW   r   rT   r   r   r   r   r!   r"   r#   <module>r      s     % ! 2 2      / A 9@N 
 
 
8 8

RYY 

RYY 
N 1 1: :zRYY @299 B*")) *Z]bii ]Q 	\7+ #' Ns   D D('D(