o
    ¶ÏiŒ$  ã                   @   sn   d dl Z ddd„Zddd„Zddd„Zdd	„ Zddede jfdd„Zdd„ Zddd„Z	ddd„Z
ddd„ZdS )é    Nc                 C   sj   t  || d||   d d|  | d| d |   ¡}|t j| ¡ |j|j|j|d7 }| ¡ d|  S )Nç       @ç      ð?é   é   ©ÚdtypeÚlayoutÚdeviceÚ	generator)ÚtorchÚwhereÚrandÚsizer   r   r	   Úfloor)Úabs_xÚexponentÚnormal_maskÚMANTISSA_BITSÚEXPONENT_BIASr
   Úmantissa_scaled© r   ú)/mnt/c/Users/fbmor/ComfyUI/comfy/float.pyÚcalc_mantissa   s   ý"r   c                 C   s   |t jkrd\}}}n|t jkrd\}}}ntdƒ‚|  ¡ } t  | ¡}|  ¡ }t  |dkd|¡}t  t  	t  
|¡¡| dd| d ¡}|dk }	t|||	|||d|d d …< |t  |	d||  d	|  d| d  | ¡9 }t  |¡}
t j||
j|
j|d
 |S )N)é   é   é   )é   r   é   zUnsupported dtyper   r   r   ©r
   r   r   )ÚminÚmaxÚout)r   Úfloat8_e4m3fnÚfloat8_e5m2Ú
ValueErrorÚhalfÚsignÚabsr   Úclampr   Úlog2r   Úfinfor   r    )Úxr   r
   ÚEXPONENT_BITSr   r   r&   r   r   r   Úinfr   r   r   Ú!manual_stochastic_round_to_float8   s.   


þ
ý
r.   c                 C   s   |t jkr| jt jdS |t jkr| jt jdS |t jkr$| jt jdS |t jks.|t jkrzt j| jd}| 	|¡ t j
| |d}td|  ¡ d ƒ}tdt| jd | ƒƒ}td| jd |ƒD ]}|||| …  t| ||| … ||d¡ q_|S | j|dS )N)r   ©r	   r   é   r   r   )r   Úfloat32ÚtoÚfloat16Úbfloat16r"   r#   Ú	Generatorr	   Úmanual_seedÚ
empty_liker    ÚnumelÚroundÚshapeÚrangeÚcopy_r.   )Úvaluer   Úseedr
   ÚoutputÚ
num_slicesÚ
slice_sizeÚir   r   r   Ústochastic_rounding2   s    



.rC   c           	      C   s:  | j }t | ¡ tj¡}t t |  ¡ ¡d ¡ dd¡}| tj	|  
¡ | j| j| j|dd d|d   d 7 } |  ¡ } t t | ¡d	 ¡ dd¡}tj|dk| d|d
   d d | d | d ¡  tj¡}~ | tj¡}|d> |d
> B |B }~~~| d¡}|dd d… d> |d
d d… B }| t|ƒd d… dg ¡S )Nr   r   r   r   g      à?r   r   g      ô?g®Gázó?r   )r!   éÿÿÿÿr   )r:   r   Úsignbitr2   Úuint8r   r)   r'   r(   r   r   r   r   r	   r   r9   ÚviewÚreshapeÚlist)	r+   r
   Ú
orig_shaper&   ÚexpÚmantissaÚfp4Úfp4_flatÚpackedr   r   r   Ústochastic_float_to_fp4_e2m1G   s*    6üû
 rP   TÚflattenÚreturnc                 C   sÌ   dd„ }| j \}}||dƒ}||dƒ}|d }|d }| }	||f||fkr;tj||f| j| jd}	| |	d|…d|…f< |	 |d|d¡ ddd	d
¡}
|
 dddd¡ d	d¡ ddd¡}|r`| 	¡ S | ||¡S )aY  
    Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.
    See:
        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout

    Args:
        input_matrix: Input tensor of shape (H, W)
    Returns:
        Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
    c                 S   s   | | d | S ©Nr   r   )ÚaÚbr   r   r   Úceil_divo   s   zto_blocked.<locals>.ceil_divé€   r   )r	   r   Nr   r   r   r   rD   é    é   )
r:   r   Úzerosr	   r   rG   ÚpermuterH   Ú	transposerQ   )Úinput_matrixrQ   rV   ÚrowsÚcolsÚn_row_blocksÚn_col_blocksÚpadded_rowsÚpadded_colsÚpaddedÚblocksÚ
rearrangedr   r   r   Ú
to_blockedc   s&   


ý"rg   c           	      C   sž   d}d}| j }d}|  |d d|¡} tjtjt | ¡dd| | | j¡ |d tj¡}| | | j¡| | j¡  	d¡ } |  
|¡ ¡ } t| |d}||fS )	Ng      @ç      |@rY   r   rD   ©Údim)r    r   )r:   rH   r   r(   Úamaxr'   r2   r   r"   Ú	unsqueezerG   Ú
nan_to_numrP   )	r+   Úper_tensor_scaler
   ÚF4_E2M1_MAXÚF8_E4M3_MAXrJ   Ú
block_sizeÚscaled_block_scales_fp8Údata_lpr   r   r   Ú%stochastic_round_quantize_nvfp4_blockŒ   s   6"rt   c                 C   s    dt dt dt fdd„}tj| jd}| |¡ |r@| j\}}||dƒ}||dƒ}	||ks0|	|kr@tjj | d|	| d|| f¡} t	| ||ƒ\} }
| t
|
d	d
fS )Nr+   ÚmultiplerR   c                 S   ó   | | d | | S ©z#Round up x to the nearest multiple.r   r   ©r+   ru   r   r   r   Úroundupž   ó   z0stochastic_round_quantize_nvfp4.<locals>.roundupr/   rY   r   F©rQ   )Úintr   r5   r	   r6   r:   ÚnnÚ
functionalÚpadrt   rg   )r+   rn   Úpad_16xr>   ry   r
   r^   r_   rb   rc   Úblocked_scaledr   r   r   Ústochastic_round_quantize_nvfp4   s   



 r‚   r0   c                 C   s„  dt dt dt fdd„}| j}|r:| j\}}||dƒ}	||dƒ}
|	|ks'|
|kr:tjj | d|
| d|	| f¡} | j}t|ƒ}tj|d d… |d d	 g tj| j	d
}tj|d d… |d d g tj
| j	d
}tj| j	d}| |¡ td|  ¡ | ƒ}tdt| jd | ƒƒ}td| jd |ƒD ]'}t| ||| … ||d\}}|||| …  |¡ |||| …  |¡ q’|t|ddfS )Nr+   ru   rR   c                 S   rv   rw   r   rx   r   r   r   ry   ²   rz   z9stochastic_round_quantize_nvfp4_by_block.<locals>.rounduprY   r   rD   r   )r   r	   r/   r   r   Fr{   )r|   r:   r   r}   r~   r   rI   ÚemptyrF   r	   r"   r5   r6   r    r8   r9   r;   rt   r<   rg   )r+   rn   r€   r>   rq   ry   rJ   r^   r_   rb   rc   Ú
output_fp4Úoutput_blockr
   r@   rA   rB   rM   Úblockr   r   r   Ú(stochastic_round_quantize_nvfp4_by_block±   s*   


 **
r‡   c                 C   sd  dd„ }|r-| j \}}||dƒ}||dƒ}||ks||kr-tjj | d|| d|| f¡} d}d}	d}
| j \}}|  |d|
¡}tjt |¡dd}tj| 	¡ | d	d
}t t 
t |¡¡ tj¡|	 dd¡}| tj¡}|dk}| tj¡d>  tj¡}t |t |¡|¡}| 	¡ | d¡  ||¡}t|tj|d}t |t |¡|¡}|t|dd tj¡fS )Nc                 S   rv   rS   r   )Úx_valru   r   r   r   ry   ×   s   z9stochastic_round_quantize_mxfp8_by_block.<locals>.rounduprX   r   rh   é   rD   ri   g       8)r   éþ   é   )r>   Fr{   )r:   r   r}   r~   r   rH   rk   r'   r(   ÚfloatÚceilr)   r2   Úint32rF   rG   r1   r   Ú	ones_likerl   rC   r"   Ú
zeros_likerg   Úfloat8_e8m0fnu)r+   Úpad_32xr>   ry   r^   r_   rb   rc   rp   Ú	E8M0_BIASÚ
BLOCK_SIZEÚ	x_blockedÚmax_absÚscale_neededÚ
exp_biasedÚblock_scales_e8m0Ú	zero_maskÚblock_scales_f32Údata_scaledÚ
output_fp8r   r   r   Ú(stochastic_round_quantize_mxfp8_by_blockÖ   s.   


 
&rž   )N)r   )T)r   r0   )r   r   r.   rC   rP   ÚboolÚTensorrg   rt   r‚   r‡   rž   r   r   r   r   Ú<module>   s    


$)

%