
    Ǆgvr                        d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
Zd dlZd dlZddlmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' erdd	l(m)Z) dd
dddddddddddddde	e*ejV                  ejX                  f   dee-   de	e.ee.df   f   dee.   dee.   dee.   de-dee*   de-d e*d!e*d"e	e*ee.   f   d#ee.   fd$Z/d% Z0e1d&k(  r e0        yy)'    N)TYPE_CHECKINGListOptionalTupleUnion   )FRAMES_PER_SECOND
HOP_LENGTHN_FRAMES	N_SAMPLESSAMPLE_RATElog_mel_spectrogrampad_or_trim)DecodingOptionsDecodingResult)add_word_timestamps)	LANGUAGESTO_LANGUAGE_CODEget_tokenizer)	exact_divformat_timestampget_end
get_writer	make_safeoptional_floatoptional_intstr2bool)Whisper)        皙?g?333333?g?      ?333333@      r!   TF   "'“¿([{-   "'.。,，!！?？:：”)]}、0)verbosetemperaturecompression_ratio_thresholdlogprob_thresholdno_speech_thresholdcondition_on_previous_textinitial_promptword_timestampsprepend_punctuationsappend_punctuationsclip_timestampshallucination_silence_thresholdmodelr   audior(   r)   .r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   c                   QRST j                  dd      rt        j                  nt        j                  } j                  t        j                  d      k(  rkt        j
                  j                         rt        j                  d       |t        j                  k(  r%t        j                  d       t        j                  }|t        j                  k(  rdd<   t        | j                  j                  t              }|j                  d   t        z
  }t        |t         z  t"        z        }j                  d	d
       j$                  sdd	<   n|rt'        d       t)        |t              j+                   j                        j+                  |      } j-                  |      \  }}t/        ||j                         d	<   |&t'        dt0        d	      j3                                 d	   }j                  dd      }t5         j$                   j6                  ||      St9        |t:              r+|r|j=                  d      ng D cg c]  }t        |       }}|D cg c]  }t?        |t@        z         }}tC        |      dk(  r|jE                  d       tC        |      dz  dk(  r|jE                  |       tG        tI        |d
d
d   |dd
d               }dQ|	r|dk(  rt        j                  d       dt        jJ                  dtL        f fd}d}||   d   RtO        t         j                  jP                        }|t         z  t"        z  }g }g } d}!|4SjS                  d|jU                         z         }"|jW                  |"       ng }"dt        dt        dt        jJ                  d tL        fRSfd!}#tY        jX                  |d"|du#      5 }$d$}%|tC        |      k  r-||   \  }&}'R|&k  r|&RR|'k\  r|dz  }|tC        |      k  r||   d   R?t        Rt         z  t"        z        }(t        Rt        z   t         z  t"        z        })t[        t        |Rz
  |'Rz
        }*|d
d
RR|*z   f   }|*t         z  t"        z  }+t)        |t              j+                   j                        j+                  |      }||!d
 d%<    ||      },t        j\                  |,j^                        }-+|,j`                  kD  }.|,jb                  kD  rd}.|.rR|*z  RAR}/g }0d&td        dt        fd'Tdtf        td           dth        fQTfd(}1d)tj        td           dtf        td           fd*}2|-jm                  Sjn                        }3|3d+d
 jq                         ddgk(  }4t        jr                  |3d
d |3dd
 z        d   }5|5ju                  d       tC        |5      dkD  r|5jq                         }6|4r|6jE                  tC        |-             d}7|6D ]p  }8|-|7|8 }9|9d   jw                         Sjn                  z
  }:|9d   jw                         Sjn                  z
  };|0jE                   |#|(|:|z  z   |(|;|z  z   |9|,,             |8}7r |4rR|*z  Rn|-|7dz
     jw                         Sjn                  z
  }<R|<|z  z  Rn|+}=|-|3jy                         j{                            }>tC        |>      dkD  rE|>d   jw                         Sjn                  k7  r%|>d   jw                         Sjn                  z
  }<|<|z  }=|0jE                   |#|(|(|=z   |-|,,             R|*z  R|	rt}        |0 S||*|
||%-       |4s$t        |0      }?|?|?|(kD  rt?        |?t@        z        R|@|}@|4s4t        |0      }?|?'|?|(kD  r"|)|?z
  }A|A@kD  rt?        |?t@        z        Rn|/|*z   R |2|0      }B|B, |1B      r$Bd   |(z
  }C|C@kD  r|/t?        Ct@        z        z   R|%}Dt        tC        |0            D ]  }E|0|E   }F|Fd.   s |1F      r |2|0Edz   d
       }G|GGd.   d   d   }Hn|(|+z   }HFd   Dz
  @kD  xs Fd   @k  xs Fd   |(z
  d/k  }IHFd   z
  @kD  xs  |1G      xs |)Fd   z
  d/k  }JIr8Jr6t?        t/        |(dz   Fd         t@        z        R||Fd   z
  @k  r|Rg |0Ed
  nFd   }D t        |0      }?|?|?}%|rG|0D ]B  }F|Fd   |Fd   |Fd0   }M}L}Kd1t        |K       d2t        |L       d3|M }Nt'        t        |N             D t        |0      D ]6  \  }O}F|Fd   |Fd   k(  sFd0   jU                         d4k(  s(d4Fd0<   g |Fd<   g |Fd.<   8 | jW                  t        |0tC        |       5      D OFcg c]  \  }O}Fd6|Oi|F c}F}O       |jW                  |0D FPcg c]  }F|Fd   D ]  }P|P  c}P}F       |r|,j                  d7kD  rtC        |      }!|$j                  t[        |R      |/z
         |tC        |      k  r-d
d
d
       te        Sj                  |tC        |"      d
       | |8      S c c}w c c}w c c}F}Ow c c}P}Fw # 1 sw Y   HxY w)9ay  
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything

    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND the average log probability
        over sampled tokens is below `logprob_threshold`, consider the segment as silent

    condition_on_previous_text: bool
        if True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    word_timestamps: bool
        Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
        and include the timestamps for each word in each segment.

    prepend_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the next word

    append_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the previous word

    initial_prompt: Optional[str]
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those word correctly.

    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

    clip_timestamps: Union[str, List[float]]
        Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
        The last end timestamp defaults to the end of the file.

    hallucination_silence_threshold: Optional[float]
        When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
        when a possible hallucination is detected

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    fp16Tcpuz2Performing inference on CPU when CUDA is availablez0FP16 is not supported on CPU; using FP32 insteadF)paddinglanguageNenz]Detecting language using up to the first 30 seconds. Use `--language` to specify the language)keyzDetected language: task
transcribe)num_languagesr;   r>   ,r      r   u*   "'“¿([{-"'.。,，!！?？:：”)]}、	translatez:Word-level timestamps on translations may not be reliable.segmentreturnc                    t        t        t        f      rgn}d }|D ]  }i }|dkD  r%|j                  dd        |j                  dd        n|j                  dd        t	        di |d|i}
j                  | |      }d}|j                  kD  rd}	|j                  	k  rd}|j                  kD  rd}|r |S  |S )	Nr   	beam_sizepatiencebest_ofr)   FT )	
isinstanceintfloatpopr   decodecompression_ratioavg_logprobno_speech_prob)rD   temperaturesdecode_resulttkwargsoptionsneeds_fallbackr*   decode_optionsr+   r4   r,   r)   s          Z/home/mcse/projects/flask_80/flask-venv/lib/python3.12/site-packages/whisper/transcribe.pydecode_with_fallbackz(transcribe.<locals>.decode_with_fallback   s    'c5\B[M 	  	A''F1u

;-

:t, 

9d+%>>A>G!LL':M"N+7!336QQ!%!-!--0AA!%#/!003FF!&!A	@      startendtokensresultc           
          |j                         }|D cg c]  }|j                  k  s| }}| |j                  |      ||j                  |j                  |j
                  |j                  d	S c c}w )N)	seekr^   r_   textr`   r)   rQ   rP   rR   )tolisteotrO   r)   rQ   rP   rR   )r^   r_   r`   ra   tokentext_tokensrc   	tokenizers         rZ   new_segmentztranscribe.<locals>.new_segment   s|     *0JEIMM4IuJJ$$[1!--!--!'!9!9$33

 
	
 Ks
   A4A4frames)totalunitdisabler   promptwordc                     | j                  dd      }| d   | d   z
  }d}|dk  r|dz  }|dk  r|d|z
  dz  z  }|d	kD  r||d	z
  z  }|S )
Nprobabilityr   r_   r^   g333333?r"   g/$?          @)get)rp   rr   durationscores       rZ   word_anomaly_scorez&transcribe.<locals>.word_anomaly_score,  sp    "hh}c:;g6%SLEe#eh."44Ec>X^+Er\   c                     | | d   sy| d   D cg c]  }|d   vs| }}|d d }t        fd|D              }|dk\  xs |dz   t        |      k\  S c c}w )NwordsFrp      c              3   .   K   | ]  } |        y w)NrJ   ).0wrx   s     rZ   	<genexpr>z9transcribe.<locals>.is_segment_anomaly.<locals>.<genexpr>=  s     Aa.q1As      g{Gz?)sumlen)rD   r~   rz   rw   punctuationrx   s       rZ   is_segment_anomalyz&transcribe.<locals>.is_segment_anomaly8  sr    ?''*: $+G$4Uq&	8TUUbq	A5AAz?UT\SZ%?? Vs
   AAsegmentsc                 (    t        d | D        d       S )Nc              3   ,   K   | ]  }|d    s	|  yw)rz   NrJ   )r}   ss     rZ   r   z9transcribe.<locals>.next_words_segment.<locals>.<genexpr>A  s     ?1AgJQ?s   
)next)r   s    rZ   next_words_segmentz&transcribe.<locals>.next_words_segment@  s    ??FFr\   )r^   r_   r`   ra   )r   r4   ri   mel
num_framesr0   r1   last_speech_timestamprz   rt   rd   [z --> z]  )r^   idg      ?)rd   r   r;   )Gru   torchfloat16float32devicecudais_availablewarningswarnr   dimsn_melsr   shaper   rM   r
   r   is_multilingualprintr   todetect_languagemaxr   titler   r@   rK   strsplitroundr	   r   appendlistzipTensorr   r   n_audio_ctxencodestripextendtqdmmintensorr`   rR   rQ   dictr   boolr   getimestamp_beginre   whereadd_itemnonzeroflattenr   r   ranger   r   	enumerater)   updaterO   )Ur4   r5   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   rY   dtyper   content_framescontent_durationmel_segment_probsr;   r>   tsseek_points
seek_clipsr[   clip_idxinput_stridetime_precision
all_tokensall_segmentsprompt_reset_sinceinitial_prompt_tokensrj   pbarr   seek_clip_startseek_clip_endtime_offsetwindow_end_timesegment_sizesegment_durationra   r`   should_skipprevious_seekcurrent_segmentsr   r   timestamp_tokenssingle_timestamp_endingconsecutiveslices
last_slicecurrent_slicesliced_tokensstart_timestamp_posend_timestamp_poslast_timestamp_posrv   
timestampslast_word_end	thresholdremaining_durationfirst_segmentgaphal_last_endsirD   next_segmenthal_next_startsilence_beforesilence_afterr^   r_   rd   lineirg   r   rc   ri   rx   sU   `  ````       `                                                                  @@@@rZ   r?   r?   &   s   f ,//=EMM5==E||u||E**::""$MMNOEMM!MMLMMME!&v eUZZ%6%6	
JCYYr]X-N^j8;FG*d+3$$)-N:&s &c8477EHHOK,,[9HAu),U		)BN:&"))N:4N*O*U*U*W)XY #:.H""6<8D))	I /3'?N!6!6s!;TV
E"I
 
 GVVeB):$:;VKV
;11
;!q >*(,SSqS1A;qtRStCT-U(VJ@K4;.RS&ell &~ & &P Hh"D%**((L 	z!K/  JL! ) 0 0~7K7K7M1M N/0 "

#
-2\\
CQ
$ 
8WE5I
 nC	 #
 Z(-7-A*O]o%&}$Ac*o-%h/2Dz 1K ?@K#TH_
$B[$PQOx$)>PT@TULa|(;!;;<K+j8;F%k8<??MPPQVWK'12D2E'FN8$%9+%FF\\&--0F".$336II%1**->> #(KL(D M!
 
% 
@HTN @t @GT$Z GHTN G .4YYy7P7P-Q&6rs&;&B&B&DPT&U#++&6s&;>Nqr>R&RSTUVKQ;!#$++-*MM#f+.
%+ /M$*:m$DM%a(--/)2K2KK ( &b)..093L3LL & %++#"-0Cn0T"T +.?..P P#0#)	 "/J!/$ +L(D zA~.335	8Q8QQ ' .==D+#$4$<$<$>$F$F$HI

Oa'"2++-1J1JJ #2++-	0I0II '  2NBH '')'(2%%	 $#-'#+)=(;*?	 /$+,<$=M$0][5P$]5F%FG 3> ?I2(/0@(A(49T1@=1P.1I=',]=N-N'O'4|'C %77G$HM$05G5V+G4{B?#05?P9P3Q#QD$ $9L#C(8$9: 6"22"6&w/$-g6+= 0a :,L  ,71=g1Fq1I'1R1<?O1O ' 0< ?) K !H#*7#3i#?!H#*7#3k#AC#G + !/ ?) K !J#5l#C!J#2WU^#Cc#I *
  .-',$'a9I$J&7%8(" $4gen#Dy#P+9D8: 0 5 %'.u~?6B !((8 9 ,,9)/ +G'.w'7QW3E/67u=Mc=R<SSUVZU[\D)D/*+ ((89 *
77#wu~59N9N9PTV9V&(GFO(*GH%')GG$	*  '0(L0A'"7 1(( &6V7GHDUV5VV .1C1Cc1I%(_" KKND1MABM Z(nC` j-B)C)EFG W

 Wl	 WMnC nCs>   i/i*U<i0'5i0i$,i0i*Ai0$i00i9c                  p   ddl m fd} t        j                  t        j                        }|j                  ddt        d       |j                  d	d
| d       |j                  dt        d d       |j                  dt        j                  j                         rdndd       |j                  ddt        dd       |j                  ddt        dg dd       |j                  dt        d d!       |j                  d"t        d#d#d$gd%       |j                  d&t        d t        t        j                               t        t        j                         D cg c]  }|j                          c}      z   d'       |j                  d(t         d)d*       |j                  d+t"        d,d-       |j                  d.t"        d,d/       |j                  d0t         d d1       |j                  d2t         d d3       |j                  d4t        d5d6       |j                  d7t        d d8       |j                  d9t        d d:       |j                  d;t        d d<       |j                  d=t$        d>d?       |j                  d@t$        dAdB       |j                  dCt$        dDdE       |j                  dFt$        dGdH       |j                  dIt        dJdK       |j                  dLt        dMdN       |j                  dOt        dPdQ       |j                  dRt        dJdS       |j                  dTt"        d dU       |j                  dVt"        d dW       |j                  dXt"        d dY       |j                  dZt"        d)d[       |j                  d\t        d]d^       |j                  d_t$        d`a       |j'                         j(                  }|j+                  db      }|j+                  dc      }|j+                  dd      }|j+                  de      }|j+                  df      }t-        j.                  |d g       |j1                  dh      r/|di   djvr(|di   t3        j4                  | dk|di    dl       dm|di<   |j+                  dn      }	|j+                  do      x}
!t7        t9        j:                  |	dp|
            }	n|	g}	|j+                  dq      x}d)kD  rt        j<                  |       ddrl m}  ||||s      }tA        ||      }g dt}|du   s"|D ]  }||   s	|jC                  dv| dw        |dx   r|dy   st3        j4                  dz       |d{   r|dy   rt3        j4                  d|       |D ci c]  }||j+                  |       }}|j+                  d      D ]  }	 tE        ||fdn|	i|} |||fi |  y c c}w c c}w # tF        $ rN}tI        jJ                          tM        d}| d~tO        |      jP                   dt        |              Y d }~|d }~ww xY w)Nr   )available_modelsc                     |         v st         j                  j                  |       r| S t        d         d      )Nzmodel should be one of z or path to a model checkpoint)ospathexists
ValueError)namer   s    rZ   valid_model_namezcli.<locals>.valid_model_name  sB    #%%)=K%&6&8%99WX
 	
r\   )formatter_classr5   +zaudio file(s) to transcribe)nargstypehelpz--modelturboz name of the Whisper model to use)defaultr   r   z--model_dirz>the path to save model files; uses ~/.cache/whisper by default)r   r   r   z--devicer   r8   z#device to use for PyTorch inference)r   r   z--output_dirz-o.zdirectory to save the outputsz--output_formatz-fall)txtvttsrttsvjsonr   zSformat of the output file; if not specified, all available formats will be produced)r   r   choicesr   z	--verboseTz4whether to print out the progress and debug messagesz--taskr?   rC   zawhether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')z
--languagezHlanguage spoken in the audio, specify None to perform language detectionz--temperaturer   ztemperature to use for samplingz	--best_of   z<number of candidates when sampling with non-zero temperaturez--beam_sizezHnumber of beams in beam search, only applicable when temperature is zeroz
--patiencezoptional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam searchz--length_penaltyzoptional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by defaultz--suppress_tokensz-1zcomma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuationsz--initial_promptz:optional text to provide as a prompt for the first window.z--condition_on_previous_textzif True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loopz--fp16z5whether to perform inference in fp16; True by defaultz#--temperature_increment_on_fallbackr    zhtemperature to increase when falling back when the decoding fails to meet either of the thresholds belowz--compression_ratio_thresholdr#   zUif the gzip compression ratio is higher than this value, treat the decoding as failedz--logprob_thresholdr$   zUif the average log probability is lower than this value, treat the decoding as failedz--no_speech_thresholdr!   zif the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silencez--word_timestampsFzQ(experimental) extract word-level timestamps and refine the results based on themz--prepend_punctuationsr%   zNif word_timestamps is True, merge these punctuation symbols with the next wordz--append_punctuationsr&   zRif word_timestamps is True, merge these punctuation symbols with the previous wordz--highlight_wordszT(requires --word_timestamps True) underline each word as it is spoken in srt and vttz--max_line_widthze(requires --word_timestamps True) the maximum number of characters in a line before breaking the linez--max_line_countzJ(requires --word_timestamps True) the maximum number of lines in a segmentz--max_words_per_linezk(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segmentz	--threadsz]number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADSz--clip_timestampsr'   zcomma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the filez!--hallucination_silence_thresholdz(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected)r   r   r4   	model_dir
output_diroutput_formatr   )exist_okz.enr;   >   r<   Englishz) is an English-only model but receipted 'z'; using English instead.r<   r)   !temperature_increment_on_fallbackgzo ?threads)
load_model)r   download_root)highlight_wordsmax_line_countmax_line_widthmax_words_per_liner/   z--z  requires --word_timestamps Truer  r  z7--max_line_count has no effect without --max_line_widthr  z8--max_words_per_line has no effect with --max_line_widthz	Skipping z due to z: ))r   r   argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentr   r   r   r   r   sortedr   keysr   r   rM   r   r   
parse_args__dict__rN   r   makedirsendswithr   r   tuplenparangeset_num_threadsr  r   errorr?   	Exception	traceback	print_excr   r   __name__)r   parserkargs
model_namer  r	  r
  r   r)   	incrementr  r  r4   writerword_optionsoptionargwriter_args
audio_pathra   er   s                         @rZ   clir4    s   "
 $$X5[5[\F
s;XY
	79IPrs
C  EE  F

ejj6M6M6OFUZ  bG  H
3Jij
)4c5R}  EZ  [
(DG}~
sL<YdJe  mP  Q
3fY^^M]F^ag  }M  }R  }R  }T  iUwxijipipir  iU  bV  GV  ]g  h
eQEfg
,  IG  H
L!  KU  V
5$  F\  ]
*  LS  T
+#t  KQ  R
*d  JF  G
6Xt  [l  m
xD{|
=N\_  gQ  R
7nVY  ax  y
-ND  Xo  p
/nc  Yy  z
+(E  Qd  e
0sO  [k  l
/cCf  nB  C
+(E  Qg  h
*t  Sz  {
*t  S_  `
.\4  WD  E
,  Ih  i
+#s  Jb  c
;.  Xa  b ''Dhhw'JXXk*Ihh|,J/2M((8$FKK
T*5!d:&6>O&O
'MM,GZHXGYYrs  Z((=)KXXABB	OBIIk:yIJ"m88I&&!+g&z&	JEz2FL !"" 	LFF|r&)IJK	L d+;&<OP !d+;&<PQ1=>#3%>K>hhw' P
	PzS{SdSF6:55PS iUP ?
  	P!Ij\$q'2B2B1C2c!fXNOO	Ps%   !WW8W	X5'AX00X5__main__)2r  r   r%  r   typingr   r   r   r   r   numpyr   r   r   r5   r	   r
   r   r   r   r   r   decodingr   r   timingr   ri   r   r   r   utilsr   r   r   r   r   r   r   r   r4   r   r   ndarrayr   r   rM   r?   r4  r'  rJ   r\   rZ   <module>r<     s    	   > >      6 ' A A	 	 	  #3Q36)-+.'+$(! .A/27;LLbjj%,,./L d^	L
 ueE3J//0L "*%L  L "%L !%L SML L L L 3U+,L &.e_L^dPN zE r\   