o
    iob                     @   s*  d dl Z d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	Zd dl
mZmZ G dd de jjZG dd de jjZG dd	 d	e jjZd
d ZG dd dZG dd dZG dd dZG dd dZG dd deZG dd dZG dd dZG dd de jjZeeeedZdS )    N)nn)WanMultiTalkAttentionBlockMultiTalkAudioProjModelc                       s,   e Zd Zddef fddZdd Z  ZS )	BlockWiseControlBlock   Ndimc                    sV   t    |j|dd| _|j|dd| _|||| _tj	 | _
|||| _d S )Ngư>)eps)super__init__RMSNormx_rmsy_rmsLinear
input_projtorchr   GELUactoutput_proj)selfr   devicedtype
operations	__class__ </mnt/c/Users/fbmor/ComfyUI/comfy_extras/nodes_model_patch.pyr
      s   
zBlockWiseControlBlock.__init__c                 C   s<   |  || |}}| || }| |}| |}|S N)r   r   r   r   r   )r   xyr   r   r   forward   s
   

zBlockWiseControlBlock.forward)r   NNN)__name__
__module____qualname__intr
   r   __classcell__r   r   r   r   r      s    r   c                	       sJ   e Zd Z					ddededed	ef fd
dZdd Zdd Z  ZS )QwenImageBlockWiseControlNet<   @   r   r   N
num_layersin_dimadditional_in_dimr   c                    sR   t    || _j||  d| _tj fddt|D | _	d S )N)r   r   c                    s   g | ]
}t  d qS r   r   r   )r   .0_r   r   r   r   r   r   
<listcomp>,   s    z9QwenImageBlockWiseControlNet.__init__.<locals>.<listcomp>)
r	   r
   r*   r   img_inr   r   
ModuleListrangecontrolnet_blocks)r   r(   r)   r*   r   r   r   r   r   r0   r   r
       s   

z%QwenImageBlockWiseControlNet.__init__c                 C   s   t j |d d d df |d d d df< d}t jj|d||f}|j}||d |d |d d d|d d d}|	dddddd	}|
|d |d d |d d  |d d }| |S )
N         r            )comfylatent_formatsWan21
process_inldm
common_ditpad_to_patch_sizeshapeviewpermutereshaper2   )r   latent_image
patch_sizehidden_states
orig_shaper   r   r   process_input_latent_image2   s   0,.
z7QwenImageBlockWiseControlNet.process_input_latent_imagec                 C   s   | j | ||S r   )r5   )r   imgcontrolnet_conditioningblock_idr   r   r   control_block<   s   z*QwenImageBlockWiseControlNet.control_block)r&   r'   r   r   NNN)r    r!   r"   r#   r
   rM   rQ   r$   r   r   r   r   r%      s"    
r%   c                       sz   e Zd ZdZ						dded	ed
ededef
 fddZdd Zdej	de
jde
jde
jdejdej	fddZ  ZS )SigLIPMultiFeatProjModela  
    SigLIP Multi-Feature Projection Model for processing style features from different layers
    and projecting them into a unified hidden space.

    Args:
        siglip_token_nums (int): Number of SigLIP tokens, default 257
        style_token_nums (int): Number of style tokens, default 256
        siglip_token_dims (int): Dimension of SigLIP tokens, default 1536
        hidden_size (int): Hidden layer size, default 3072
        context_layer_norm (bool): Whether to use context layer normalization, default False
      r'     r   TNsiglip_token_numsstyle_token_numssiglip_token_dimshidden_sizecontext_layer_normc	           	         s   t    t|||t | _|r||nt | _	|j||dd| _
t|||t | _|r;||nt | _|j||dd| _t|||t | _|r]||nt | _|j||dd| _d S )NT)bias)r	   r
   r   
Sequentialr   SiLUhigh_embedding_linear	LayerNormIdentityhigh_layer_normhigh_projectionmid_embedding_linearmid_layer_normmid_projectionlow_embedding_linearlow_layer_normlow_projection)	r   rU   rV   rW   rX   rY   r   r   r   r   r   r   r
   M   s,   
	


z!SigLIPMultiFeatProjModel.__init__c                 C   sx   t | j j}| |d | j| j| j|}| |d | j| j| j	|}| |d | j
| j| j|}tj|||fddS )a  
        Forward pass function

        Args:
            siglip_outputs: Output from SigLIP model, containing hidden_states

        Returns:
            torch.Tensor: Concatenated multi-layer features with shape [bs, 3*style_token_nums, hidden_size]
        r7   r8   r   r   )nextr]   
parametersr   _process_layer_featuresr`   ra   rb   rc   rd   re   rf   rg   r   cat)r   siglip_outputsr   high_embeddingmid_embeddinglow_embeddingr   r   r   r   v   s.   
			z SigLIPMultiFeatProjModel.forwardrK   embedding_linear
layer_norm
projectionr   returnc                 C   s2   || |dddd}||}||}|S )a  
        Helper function to process features from a single layer

        Args:
            hidden_states: Input hidden states [bs, seq_len, dim]
            embedding_linear: Embedding linear layer
            layer_norm: Layer normalization
            projection: Projection layer
            dtype: Target data type

        Returns:
            torch.Tensor: Processed features [bs, style_token_nums, hidden_size]
        r8   r7   )to	transpose)r   rK   rq   rr   rs   r   	embeddingr   r   r   rk      s   z0SigLIPMultiFeatProjModel._process_layer_features)rS   r'   rT   r   TNNN)r    r!   r"   __doc__r#   boolr
   r   r   Tensorr   Moduler   rk   r$   r   r   r   r   rR   @   sB    )*rR   c           	      C   s   ddddd}i }t |  D ]F}| | }|}|dr |g}q|dr+|g| }q|drB||g }tj|d	d
}|dd}| D ]
\}}|||}qF|||< q|S )Nz.attention.out.biasz.attention.k_norm.weightz.attention.q_norm.weightz.attention.out.weight)z.attention.to_out.0.biasz.attention.norm_k.weightz.attention.norm_q.weightz.attention.to_out.0.weightz.attention.to_k.weightz.attention.to_q.weightz.attention.to_v.weightr   rh   z.attention.qkv.weight)sortedkeysendswithr   rl   replaceitems)	sdreplace_keysout_sdkwk_outccrrrr   r   r   z_image_convert   s.   





r   c                   @   0   e Zd Zedd ZdZdZdZdZdd Z	d	S )
ModelPatchLoaderc                 C   s   ddt dfiiS )Nrequirednamemodel_patches)folder_pathsget_filename_listsr   r   r   INPUT_TYPES   s   zModelPatchLoader.INPUT_TYPESMODEL_PATCHload_model_patchTzadvanced/loadersc           
   
   C   s  t d|}tjj|dd}tj|}d|v r/|d jd d }t|tj	 |tj
jd}nd	|v rKtjj|d
didd}ttj	 |tj
jd}nxd|v rt|}i }d|vred|d< d|d< d|d< d|v rd|d< d|d< d|d< |dd }|d urt|dkrd|d< tjjjjd#tj	 |tj
jd|}n$d|v rtddd|d jd |d jd |d jd tj	 tj
jd }tjj|tj tj	 d!}	|j||	 d" |	fS )$Nr   T)	safe_loadz controlnet_blocks.0.y_rms.weightzimg_in.weightr8   r'   )r*   r   r   r   z$feature_embedder.mid_layer_norm.biaszfeature_embedder. )filter_keysr,   z!control_all_x_embedder.2-1.weightz*control_layers.4.adaLN_modulation.0.weightr<   n_control_layers   r*   refiner_controlz+control_layers.14.adaLN_modulation.0.weight   z)control_noise_refiner.0.after_proj.weightr   brokenzaudio_proj.proj1.weightr=       r;   z%blocks.0.audio_cross_attn.proj.weightzaudio_proj.norm.weight)audio_windowcontext_tokens	vae_scaler)   intermediate_dimout_dimr   r   )load_deviceoffload_device)assignr   )r   get_full_path_or_raiser>   utilsload_torch_fileweight_dtyperE   r%   model_managementunet_offload_deviceopsmanual_caststate_dict_prefix_replacerR   r   getr   count_nonzerorB   lumina
controlnetZImage_ControlMultiTalkModelPatchmodel_patcherCoreModelPatcherget_torch_deviceload_state_dict
is_dynamic)
r   r   model_patch_pathr   r   r*   modelconfig
ref_weightr   r   r   r   r      sJ   *z!ModelPatchLoader.load_model_patchN)
r    r!   r"   classmethodr   RETURN_TYPESFUNCTIONEXPERIMENTALCATEGORYr   r   r   r   r   r      s    
r   c                   @   s6   e Zd ZdddZdd Zdd Zdd	 Zd
d ZdS )DiffSynthCnetPatchNc                 C   sL   || _ || _|| _|| _|| _|j| || _|j	d |j	d f| _
d S )Nr8   r7   )model_patchvaeimagestrengthmaskr   rM   encode_latent_condencoded_imagerE   encoded_image_size)r   r   r   r   r   r   r   r   r   r
     s   zDiffSynthCnetPatch.__init__c                 C   s   | j |}| jjjdkrD| jd u r%t|d d d | jjjd f }ntj	
| jjddd|jd |jd dd	}tj||gdd
S |S )Nr   r;   r8   Tr   keepdimr:   r9   bilinearnonerh   )r   encoder   r   r*   r   r   	ones_liker>   r   common_upscalemeanrE   rl   )r   r   rI   mask_r   r   r   r     s   
&,z%DiffSynthCnetPatch.encode_latent_condc              
   C   s:  | d}| d}| d}| j }| jd u s*| j|jd | |jd | fkrjtj| j	
dd|jd | |jd | dd}tjjd	d
}| jj| |
dd| _|jd |jd f| _tj| |d d d | jjd f  | jj|d d d | jjd f | j|j|| j 7  < ||d< |S )Nr   rN   block_indexr9   r:   r8   areacenterTonly_currently_used)r   r   spacial_compression_encoder   r   rE   r>   r   r   r   movedimr   loaded_modelsr   r   rM   r   load_models_gpurQ   ru   r   r   )r   kwargsr   rN   r   spacial_compressionimage_scaledr   r   r   r   __call__)  s   



,2ZzDiffSynthCnetPatch.__call__c                 C      t |tjr| j|| _| S r   
isinstancer   r   r   ru   r   device_or_dtyper   r   r   ru   9     zDiffSynthCnetPatch.toc                 C      | j gS r   r   r   r   r   r   models>     zDiffSynthCnetPatch.modelsr   r    r!   r"   r
   r   r   ru   r   r   r   r   r   r     s    
	r   c                   @   s8   e Zd ZdddZdddZdd Zdd	 Zd
d ZdS )ZImageControlPatchNc                 C   s   || _ || _|| _|| _|| _|| _| j jjdk| _d}| jd ur0| jd ur0| jj	| jj	kr0d}|r6d | _
n)| | j| j| _
| jd u rR| jj	d | jj	d f| _n| jj	d | jj	d f| _d | _d S )Nr   FTr8   r7   )r   r   r   inpaint_imager   r   r   r*   
is_inpaintrE   r   r   r   	temp_data)r   r   r   r   r   r   r   skip_encodingr   r   r   r
   B  s$   

zZImageControlPatch.__init__c              	   C   s  d }|d urt j | j|}| jr|d u r t|d }| j	d ur[t j
| j	| j	jd d| j	jd | j	jd jddd|jd |jd d	d
}|d |dd  d }t j | j|}| j	d u rzt|d d d df }n-t j
| j	| j	jd d| j	jd | j	jd jdddj|jd|jd |jd dd
}|d u rt j | jt|d }tj|||gddS |S )Ng      ?r   r:   r9   r8   Tr   r   r   )r   nearestrh   )r>   r?   FluxrA   r   r   r   r   r   r   r   r   rF   rE   r   r   round
zeros_likeru   r   rl   )r   control_imager   rI   mask_inpaintinpaint_image_latentr   r   r   r   r   Z  s"   
P
Z"z%ZImageControlPatch.encode_latent_condc              
   C   s  | d}| d}| d}| d}| d}| d}| d}| dd	}	| j }
| jd u sD| j|jd
 |
 |jd |
 fkrd }| jd urstj	| j
dd|jd |
 |jd
 |
 dd
dd}|jd |jd
 f| _d }| jd urtj	| j
dd|jd |
 |jd
 |
 dd
dd}|jd |jd
 f| _tjjdd}| ||| _tj| | jjj}td| }|| }|| }|d |d ||d krd | _|S | jd u s| jd |kr|	dkrdd | j|| j|j||ff| _ndd | j|| j|j||ff| _|	dkrq| jd d }|| jj|| jd d |d d d | jd d jd f d ||f| _| jd d d uro|d d d | jd d jd f  | jd d | j 7  < |S | jd |k r| jd d |k r| jd d }|| jj|| jd d |d d d | jd d jd f d ||f| _| jd |k r| jd d |k s|| jd kr|d d d | jd d jd f  | jd d | j 7  < || jd d krd | _|S )Nr   rN   	img_inputtxtpevecr   
block_typer   r9   r:   r8   r   r   r   Tr      r   noise_refiner)r   r   r   r   r   rE   r   r>   r   r   r   r   r   r   r   r   r   r   r   r   popr   ru   r   forward_noise_refiner_blockr   forward_control_block)r   r   r   rN   r   r   r   r  r   r  r   r   inpaint_scaledr   cnet_blocksdiv
cnet_indexcnet_index_float
next_layerr   r   r   r   u  sb   







,
:
:



(&
H<$H$<zZImageControlPatch.__call__c                 C   s.   t |tjr| jd ur| j|| _d | _| S r   )r   r   r   r   ru   r   r   r   r   r   ru     s
   
zZImageControlPatch.toc                 C   r   r   r   r   r   r   r   r     r   zZImageControlPatch.models)NNr   r   r   r   r   r   A  s    

=r   c                   @   s2   e Zd Zedd ZdZdZdZdZdd	d
Z	dS )QwenImageDiffsynthControlnetc              
   C   s(   ddddddddd	d
fdddidS )NMODELr   VAEIMAGEFLOAT      ?      $      $@{Gz?defaultminmaxstep)r   r   r   r   r   r   MASKr   optionalr   r   r   r   r   r     s   z(QwenImageDiffsynthControlnet.INPUT_TYPESr  diffsynth_controlnetTzadvanced/loaders/qwenNr  c           
      C   s   |  }|d ur|d d d d d d d df }|d ur,|d d d d d d d df }|d urH|jdkr:|d}|jdkrD|d}d| }t|jtjjjj	rit
||||||d}	||	 ||	 |fS |t||||| |fS )Nr<   r8   r;   r7   r  )r   r   )clonendim	unsqueezer   r   r>   rB   r   r   r   r   set_model_noise_refiner_patchset_model_double_block_patchr   )
r   r   r   r   r   r   r   r   model_patchedpatchr   r   r   r#    s$     





z1QwenImageDiffsynthControlnet.diffsynth_controlnet)Nr  NN)
r    r!   r"   r   r   r   r   r   r   r#  r   r   r   r   r    s    
r  c                   @   s   e Zd Zedd ZdZdS )ZImageFunControlnetc              	   C   s*   ddddddddd	fd
dddddS )Nr  r   r  r  r  r  r  r  r  )r   r   r   r   r  r  )r   r   r   r!  r   r   r   r   r   r     s   
zZImageFunControlnet.INPUT_TYPESzadvanced/loaders/zimageN)r    r!   r"   r   r   r   r   r   r   r   r+    s    
r+  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
UsoStyleProjectorPatchc                 C   s   || _ || _d S r   )r   r   )r   r   r   r   r   r   r
     s   
zUsoStyleProjectorPatch.__init__c                 C   s   | d}| d}| j| j|j|j}tj||gdd}||d< tjtj|j	d |j	d d|j|j
d|gdd|d< |S )Ntxt_idsr   r8   rh   r   r<   )r   r   )r   r   r   r   ru   r   r   rl   zerosrE   r   )r   r   r-  r   siglip_embeddingr   r   r   r     s   

6zUsoStyleProjectorPatch.__call__c                 C   r   r   r   r   r   r   r   ru     r   zUsoStyleProjectorPatch.toc                 C   r   r   r   r   r   r   r   r      r   zUsoStyleProjectorPatch.modelsN)r    r!   r"   r
   r   ru   r   r   r   r   r   r,    s
    	r,  c                   @   r   )
USOStyleReferencec                 C   s   dddddiS )Nr   r  r   )CLIP_VISION_OUTPUT)r   r   clip_vision_outputr   r   r   r   r   r     s   zUSOStyleReference.INPUT_TYPESr  apply_patchTzadvanced/model_patches/fluxc                 C   sL   t |jd d df |jd d df |jf}| }|t|| |fS )Nii)r   stackall_hidden_statespenultimate_hidden_statesr$  set_model_post_input_patchr,  )r   r   r   r2  r   r)  r   r   r   r3    s   .zUSOStyleReference.apply_patchN)
r    r!   r"   r   r   r   r   r   r   r3  r   r   r   r   r0    s    
r0  c                       sL   e Zd Z								dd	ed
edededededef fddZ  ZS )r   r=            r   r;   (   Nr   r   r)   r   r   r   r(   c              
      sX   t    t||| d || d| _tj fddt|D | _d S )Nr8   )seq_len
seq_len_vfr   r   r   r   r   r   c              	      s   g | ]}t  d qS r+   )r   r-   r   r   r)   r   r   r   r   r1   1  s    z0MultiTalkModelPatch.__init__.<locals>.<listcomp>)	r	   r
   r   
audio_projr   r   r3   r4   blocks)r   r   r   r)   r   r   r   r(   r   r   r   r   r>  r   r
     s    



zMultiTalkModelPatch.__init__)
r=   r8  r9  r:  r   r;   r;  NNN)r    r!   r"   r#   r
   r$   r   r   r   r   r     s0    r   )r   r  r+  r0  )r   r   r   comfy.utilsr>   	comfy.opscomfy.model_managementcomfy.ldm.common_ditcomfy.latent_formatscomfy.ldm.lumina.controlnetcomfy.ldm.wan.model_multitalkr   r   r{   r   r%   rR   r   r   r   r   r  r+  r,  r0  r   NODE_CLASS_MAPPINGSr   r   r   r   <module>   s8    ! 4.{%!
