
    Ǆg20                        d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlZi ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&i d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHi dIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidji dkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddddddddddddi ddddddddddddddddddddddddddddddddddddddddddddddddddZi ej!                         D  ci c]  \  } }|| 
 c}} dddddddd7d7ddddZe G d dë             Z edī      ddedefdȄ       Z edī      ddddɜdededee   dee   def
d΄       Zyc c}} w )    N)	dataclassfield)cached_property	lru_cache)DictListOptionalTupleenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesanskritluxembourgishmyanmartibetantagalogmalagasyassamesetatarhawaiianlingalahausabashkirjavanese	sundanese	cantonese)salbmybotlmgastthawlnhabajwsuyuer   r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianmandarinc                   \   e Zd ZU dZej
                  ed<   eed<   dZe	e
   ed<   dZe	e
   ed<   dZee   ed<    ee	      Zee
ef   ed
<   d Zd Zdee   de
fdZdee   de
fdZedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Z edefd       Z!d Z"edee   fd       Z#edee
   fd       Z$edee   fd       Z%edee   fd       Z&d ee   fd!Z'd ee   fd"Z(d ee   fd#Z)y)$	TokenizerzIA thin wrapper around `tiktoken` providing quick access to special tokensencodingnum_languagesNlanguagetask sot_sequence)default_factoryspecial_tokensc                 :   | j                   j                  D ],  }| j                   j                  |      }|| j                  |<   . | j                  d   }| j                  d   }| j                  d   }t	        t
        j                               d | j                   }|g}| j                  0|j                  |dz   |j                  | j                        z          | j                  $| j                  dk(  r|n|}|j                  |       t	        |      | _        y )N<|startoftranscript|><|translate|><|transcribe|>   
transcribe)r   special_tokens_setencode_single_tokenr   tuple	LANGUAGESkeysr   r   appendindexr   r   )	selfspecialspecial_tokensot	translater   langsr   
task_tokens	            Y/home/mcse/projects/flask_80/flask-venv/lib/python3.12/site-packages/whisper/tokenizer.py__post_init__zTokenizer.__post_init__   s    }}77 	9G MM==gFM+8D(	9 &&'>?,,_=	--.>?
inn&'(<$*<*<=u==$a%++dmm*D DE99 ,0II,Ej9J
+!,/    c                 <     | j                   j                  |fi |S N)r   encode)r   textkwargss      r   r  zTokenizer.encode   s    #t}}##D3F33r   	token_idsreturnc                     |D cg c]  }|| j                   k  s| }} | j                  j                  |fi |S c c}w r   )timestamp_beginr   decode)r   r  r  ts       r   r  zTokenizer.decode   sD     )F1Q1E1E-EQF	F#t}}##I888 Gs   ==c                 <     | j                   j                  |fi |S )z
        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        )r   r  )r   r  r  s      r   decode_with_timestampsz Tokenizer.decode_with_timestamps   s     
 $t}}##I888r   c                 .    | j                   j                  S r   )r   	eot_tokenr   s    r   eotzTokenizer.eot   s    }}&&&r   c                      | j                   d   S )Nr   r   r  s    r   r   zTokenizer.transcribe   s    ""#344r   c                      | j                   d   S )Nr   r  r  s    r   r   zTokenizer.translate       ""?33r   c                      | j                   d   S )Nr   r  r  s    r   r   zTokenizer.sot   s    ""#:;;r   c                      | j                   d   S )N<|startoflm|>r  r  s    r   sot_lmzTokenizer.sot_lm   r  r   c                      | j                   d   S )N<|startofprev|>r  r  s    r   sot_prevzTokenizer.sot_prev   s    ""#455r   c                      | j                   d   S )N<|nospeech|>r  r  s    r   	no_speechzTokenizer.no_speech   s    "">22r   c                      | j                   d   S )N<|notimestamps|>r  r  s    r   no_timestampszTokenizer.no_timestamps   s    ""#566r   c                      | j                   d   S )Nz<|0.00|>r  r  s    r   r  zTokenizer.timestamp_begin   s    "":..r   c                 f    | j                   t        d      | j                  | j                         S )zGReturns the token id corresponding to the value of the `language` fieldz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr  s    r   language_tokenzTokenizer.language_token   s/     == UVV%%dmm44r   c                 h    | j                   j                  d| dd       x}r|S t        d| d      )N<||>z	Language z not found in tokenizer.)r   getKeyError)r   r   tokens      r   r$  zTokenizer.to_language_token   sC    ''++b
",=tDD5DL8*,DEFFr   c                     g }| j                   j                         D ].  \  }}|j                  d      t        v s|j	                  |       0 t        |      d | j                   S )N<|>)r   itemsstripr   r   r   r   )r   resultr+  token_ids       r   all_language_tokenszTokenizer.all_language_tokens   s`    #2288: 	(OE8{{5!Y.h'	( V}1t1122r   c                 @     t         fd j                  D              S )Nc              3   `   K   | ]%  }j                  |g      j                  d        ' yw)r-  N)r  r/  ).0_lr   s     r   	<genexpr>z/Tokenizer.all_language_codes.<locals>.<genexpr>   s'     WT[["&,,U3Ws   +.)r   r2  r  s   `r   all_language_codeszTokenizer.all_language_codes   s    Wd>V>VWWWr   c                 Z    t        t        | j                        | j                  gz         S r   )r   listr   r   r  s    r   #sot_sequence_including_notimestampsz-Tokenizer.sot_sequence_including_notimestamps   s&    T$++,0B0B/CCDDr   c                    t        d      }|dj                         z  }t        d      }t        d |D              sJ | j                  j                  d      d   | j                  j                  d      d   h}|t        |      z   D ]g  }| j                  j                  |      | j                  j                  d|z         fD ])  }t        |      d	k(  s||v s|j                  |d          + i t        t        |            S )
u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   N   K   | ]  }d t        |      cxk  xr dk  nc   yw)i@&  i&  N)ord)r5  cs     r   r7  z.Tokenizer.non_speech_tokens.<locals>.<genexpr>  s!     E!6SV-v--Es   #%z -r   z ' r   )
r:  splitsetallr   r  lenaddr   sorted)r   symbolsmiscellaneousr0  symboltokenss         r   non_speech_tokenszTokenizer.non_speech_tokens   s    =>Z``b	
 34E}EEEE --&&t,Q/1E1Ed1KA1NO] 33 	*F$$V,$$S6\2 * v;!#v'>JJvay)*	* VF^$$r   rJ  c                 b    | j                   dv r| j                  |      S | j                  |      S )N>   r   r   r   rG   r   r   )r   split_tokens_on_unicodesplit_tokens_on_spaces)r   rJ  s     r   split_to_word_tokenszTokenizer.split_to_word_tokens  s3    ==AA //77**622r   c                 4   | j                  |      }d}g }g }g }d}|D ]u  }|j                  |       | j                  |      }	||	vs|||	j                  |      z      |k(  sD|j                  |	       |j                  |       g }|t        |	      z  }w ||fS )Nu   �r   )r  r   r   rD  )
r   rJ  decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr+  decodeds
             r   rM  z!Tokenizer.split_tokens_on_unicode  s    226:# 	/E!!%(11.AG !/?O1P PQ#$ W%"">2!##g,.	/ k!!r   c                    | j                  |      \  }}g }g }t        ||      D ]  \  }}|d   | j                  k\  }|j                  d      }	|j	                         t
        j                  v }
|s|	s|
st        |      dk(  r#|j                  |       |j                  |       |d   |z   |d<   |d   j                  |        ||fS )Nr   r@  )
rM  zipr  
startswithr/  stringpunctuationrD  r   extend)r   rJ  subwordssubword_tokens_listrS  rT  subwordsubword_tokensr   
with_spacer]  s              r   rN  z Tokenizer.split_tokens_on_spaces7  s    (,(D(DV(L%%'*85H'I 		7#G^$Q'4883G ++C0J!--/V-?-??K*s5zQW%"">2!"I/b	B&&~6		7 k!!r   )*__name__
__module____qualname____doc__tiktokenEncoding__annotations__intr   r	   strr   r   r
   r   dictr   r   r   r  r   r  r  r   r  r   r   r   r  r  r  r   r  r%  r$  r2  r8  r;  rK  rO  rM  rN  r   r   r   r   r      sW   S"Hhsm"D(3-!L%*!%*4%@NDcN@0&49S	 9 99S	 9 9 'S ' ' 5C 5 5 43 4 4 <S < < 4 4 4 6# 6 6 33 3 3 7s 7 7 / / / 5 5 5G 3U3Z 3 3 XE#J X X EU3Z E E !%5: !% !%F349 3"d3i "2"T#Y "r   r   )maxsizec   namer   c                    t         j                  j                  t         j                  j                  t              d|  d      }d t        |      D        D ci c]$  \  }}t        j                  |      t        |      & }}}t        |      }i }ddgt        t        j                               d | D cg c]  }d| d
 c}dd	d
dddt        d      D 	cg c]  }	d|	dz  dd c}	}
|
D ]  }|||<   |dz  } t        j                  t         j                  j!                  |      |d||      S c c}}w c c}w c c}	w )Nassetsz	.tiktokenc              3   B   K   | ]  }|s|j                           y wr   )rA  )r5  lines     r   r7  zget_encoding.<locals>.<genexpr>O  s     NTDJJLNs   z<|endoftext|>r   r'  r(  r   r   r  r  r  r  i  g{Gz?z.2fr   zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)rp  explicit_n_vocabpat_strmergeable_ranksr   )ospathjoindirname__file__openbase64	b64decoderk  rD  r:  r   r   rangerh  ri  basename)rp  r   
vocab_pathr+  rankranksn_vocabr   langispecialss              r   get_encodingr  J  s   bggooh7dV9CUVJ OT*5ENE4 	T*E  %jGN 	 %))9$:>M$J	KDBtfB-	K 		
 	 	 	 	 	 */t	5ABq4xnB
	5H   'u1 WWj) a% 1 
L 
6s   )D?9E"E
)r   r   r   multilingualr   r   r  c                    |8|j                         }|t        vr |t        v r
t        |   }nt        d|       | rd}|xs d}|xs d}nd}d }d }t	        ||      }t        ||||      S )NzUnsupported language: r  r   r   gpt2)rp  r   )r   r   r   r   )lowerr   TO_LANGUAGE_CODEr#  r  r   )r  r   r   r   encoding_namer   s         r   get_tokenizerr  n  s     >>#9$+++H5 #9(!DEE&#t#|mLHPT r   )r  ro  )r~  rx  r\  dataclassesr   r   	functoolsr   r   typingr   r   r	   r
   rh  r   r.  r  r   rl  rk  r  boolr  )coder   s   00r   <module>r     s    	  ( 0 . . e)e)e 	(e 	)	e
 	)e 	(e 	(e 	*e 	,e 	)e 	(e 	)e 	'e 	(e 	)e  	)!e" 	,#e$ 	'%e& 	)'e( 	,)e* 	(+e, 	+-e. 	'/e0 	'1e2 	'3e4 	*5e6 	(7e8 	+9e: 	';e< 	+=e> 	&?e@ 	&AeB 	*CeD 	+EeF 	,GeH 	'IeJ 	'KeL 	+MeN 	'OeP 	(QeR 	(SeT 	)UeV 	)WeX 	)YeZ 	)[e\ 	-]e^ 	+_e` 	)aeb 	*ced 	,eef 	(geh 	(iej 	+kel 	*men 	(oep 	+qer 	)set 	(uev 	*wex 	)yez 	*{e| 	)}e~ 	)e@ 	)AeB 	'CeD 	'EeF 	(GeH 	(IeJ 	+KeL 	)MeN 	*OeP 	,QeR 	'SeT 	(UeV 	*WeX 	)YeZ 	)[e\ 	%]e^ 	'_e` 	)aeb 	
ced 	(eef 	)geh 	)iej 	)kel 











Ie	P,5OO,=>.$x~> " C" C" C"L 4 s  C    F 4 "  sm	
 3-  y ?s   G