o
    i                     @   s   d dl Z d dlmZ d dlm  mZ d dlZd dlmZ d dl	m
Z
 d dlZG dd dejZG dd dejZG dd	 d	ejZG d
d dejZG dd dejZdS )    N)Optional)optimized_attention_maskedc                       s&   e Zd Zd fdd	Zdd Z  ZS )WhisperFeatureExtractor   Nc              
      s^   t    d| _d| _d| _|| _d| _d| _tj	j
| j| j| j| jddddd	|| _d S )
Ni>  i        i S r   i@  slaney)sample_raten_fft
hop_lengthn_melsf_minf_maxnorm	mel_scale)super__init__r	   r
   r   r   chunk_length	n_samples
torchaudio
transformsMelSpectrogramtomel_spectrogram)selfr   device	__class__ :/mnt/c/Users/fbmor/ComfyUI/comfy/audio_encoders/whisper.pyr   
   s&   
	z WhisperFeatureExtractor.__init__c                 C   s   t j|dd}|jd }g }t|D ]1}|| }|jd | jkr(|d | j }n|jd | jk r>t|d| j|jd  f}|| qt |}| 	|
| j	jjjd d d d d df 
|j}t j|dd }t || d }|d d }|S )	N   )dimr   g|=)ming       @g      @)torchmeanshaperanger   Fpadappendstackr   r   spectrogramwindowr   clamplog10maximummax)r   audio
batch_sizeprocessed_audioiaudmel_speclog_mel_specr   r   r   __call__   s    

6z WhisperFeatureExtractor.__call__)r   N)__name__
__module____qualname__r   r9   __classcell__r   r   r   r   r   	   s    r   c                       sX   e Zd Zddedef fddZ	ddejdejdejd	eej d
ejf
ddZ  Z	S )MultiHeadAttentionNd_modeln_headsc                    s   t    || dksJ || _|| _|| | _|j||||d| _|j||d||d| _|j||||d| _|j||||d| _	d S )Nr   dtyper   F)biasrB   r   )
r   r   r?   r@   d_kLinearq_projk_projv_projout_proj)r   r?   r@   rB   r   
operationsr   r   r   r   7   s   

zMultiHeadAttention.__init__querykeyvaluemaskreturnc                 C   sJ   |j \}}}| |}| |}	| |}
t||	|
| j|}| |}|S N)r&   rF   rG   rH   r   r@   rI   )r   rK   rL   rM   rN   r3   seq_len_qkvattn_outputr   r   r   forwardD   s   



zMultiHeadAttention.forwardNNNrP   
r:   r;   r<   intr   r$   Tensorr   rW   r=   r   r   r   r   r>   6   s    r>   c                       sP   e Zd Zddededef fddZ	ddejdeej d	ejfd
dZ  Z	S )EncoderLayerNr?   r@   d_ffc                    sn   t    t|||||d| _|j|||d| _|j||||d| _|j||||d| _|j|||d| _	d S )NrB   r   rJ   rA   )
r   r   r>   	self_attn	LayerNormself_attn_layer_normrE   fc1fc2final_layer_norm)r   r?   r@   r]   rB   r   rJ   r   r   r   r   X   s   
zEncoderLayer.__init__xattention_maskrO   c                 C   s^   |}|  |}| ||||}|| }|}| |}| |}t|}| |}|| }|S rP   )ra   r_   rd   rb   r(   gelurc   )r   re   rf   residualr   r   r   rW   b   s   




zEncoderLayer.forwardrX   rP   rY   r   r   r   r   r\   W   s    r\   c                       sZ   e Zd Z								ddeded	ed
edef
 fddZdejdejfddZ  ZS )AudioEncoderr               Nr   n_ctxn_staten_headn_layerc	           	   	      s   t    j|dd d| _jddd d| _j| d| _t fddt	|D | _
j d| _d S )	N   r    )kernel_sizepaddingrB   r      )rs   stridert   rB   r   rA   c              
      s$   g | ]}t d   dqS )   r^   )r\   ).0rR   r   rB   rp   ro   rJ   r   r   
<listcomp>   s    z)AudioEncoder.__init__.<locals>.<listcomp>)r   r   Conv1dconv1conv2	Embeddingembed_positionsnn
ModuleListr'   layersr`   
layer_norm)	r   r   rn   ro   rp   rq   rB   r   rJ   r   ry   r   r   w   s   

zAudioEncoder.__init__re   rO   c                 C   s   t | |}t | |}|dd}|tj| jj	d d d |j
d f | }d}| jD ]}||f7 }||}q1| |}||f7 }||fS )Nr    ru   r   )r(   rg   r|   r}   	transposecomfyopscast_to_inputr   weightr&   r   r   )r   re   all_xlayerr   r   r   rW      s   ,




zAudioEncoder.forwardr   rj   rk   rl   rm   NNN)	r:   r;   r<   rZ   r   r$   r[   rW   r=   r   r   r   r   ri   v   s*    ri   c                       sL   e Zd Z								ddeded	ed
edef
 fddZdd Z  ZS )WhisperLargeV3r   rj   rk   rl   rm   Nr   n_audio_ctxn_audio_staten_audio_headn_audio_layerc	           	   
      s6   t    t||d| _t||||||||d| _d S )N)r   r   r^   )r   r   r   feature_extractorri   encoder)	r   r   r   r   r   r   rB   r   rJ   r   r   r   r      s   

zWhisperLargeV3.__init__c                 C   s    |  |}| |\}}||fS rP   )r   r   )r   r2   melre   r   r   r   r   rW      s   
zWhisperLargeV3.forwardr   )r:   r;   r<   rZ   r   rW   r=   r   r   r   r   r      s*    r   )r$   torch.nnr   torch.nn.functional
functionalr(   r   typingr   comfy.ldm.modules.attentionr   	comfy.opsr   Moduler   r>   r\   ri   r   r   r   r   r   <module>   s    -!,