o
    j)                     @   s  U d Z ddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZ ddlmZ eeeef eee eeeeef f f f Zi ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4Zeeef ed5< d6d7 e D Zeeef ed8< d9ed:efd;d<Zejd:efd=d>Zi ddddd
dddddddddddddddd dd"dd$dd&dd'dd)dd(ddddddd	dddddddd?Zeeef ed@< d9edAedBedCee dDeeeeef f d:edB fdEdFZdGedHeeeeedB e ef  f dIed:e!fdJdKZ"d9edAedBedCee d:edB f
dLdMZ#dNedAedBed:eeef dB fdOdPZ$dQZ%d9edRee d:ee fdSdTZ&dS )Ua  Confusion group resolution for similar single-byte encodings.

At runtime, loads pre-computed distinguishing byte maps from confusion.bin
and uses them to resolve statistical scoring ties between similar encodings.

Build-time computation (``compute_confusion_groups``, ``compute_distinguishing_maps``,
``serialize_confusion_data``) lives in ``scripts/confusion_training.py``.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)BigramProfileget_enc_indexget_idf_weightsscore_with_profile)DetectionResult)lookup_encodingLu   Ll   Lt   Lm   Lo   Mn   Mc   Me   Nd	   Nl
   No   Pc   Pd   Ps   Pe   Pi   PfPoSmScSkSoZsZlZpCcCfCsCoCn)                                       _INT_TO_CATEGORYc                 C   s   i | ]\}}||qS  rD   ).0kvrD   rD   `/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/chardet/pipeline/confusion.py
<dictcomp>G   s    rI   _CATEGORY_TO_INTdatareturnc                 C   s"  i }d}t d| |\}|d7 }t|D ]z}t d| |\}|d7 }| |||  d}||7 }t d| |\}|d7 }| |||  d}||7 }t d| |\}	|d7 }g }
i }t|	D ]#}t d| |\}}}|d7 }|
| t|d	t|d	f||< q`t|
|f|||f< q|S )
zLoad confusion group data from raw bytes.

    :param data: The raw binary content of a confusion.bin file.
    :returns: A :data:`DistinguishingMaps` dictionary keyed by encoding pairs.
    r   z!Hr   z!Br	   zutf-8z!BBBr   r5   )structunpack_fromrangedecodeappendrC   get	frozenset)rK   resultoffset	num_pairs_
name_a_lenname_a
name_b_lenname_b	num_diffsdiff_bytes_list
categoriesbv	cat_a_int	cat_b_intrD   rD   rH   %deserialize_confusion_data_from_bytesJ   s4   



rb   c               
   C   s   t jdd} |  }|stjdtdd i S zt|}W n t	j
tfy8 } z
d| }t||d}~ww i }| D ]\\}}}t|pJ|}	t|pP|}
|||	|
f< q?|S )zLoad confusion group data from the bundled confusion.bin file.

    :returns: A :data:`DistinguishingMaps` dictionary keyed by encoding pairs.
    zchardet.modelszconfusion.binuZ   chardet confusion.bin is empty — confusion resolution disabled; reinstall chardet to fixr   )
stacklevelzcorrupt confusion.bin: N)	importlib	resourcesfilesjoinpath
read_byteswarningswarnRuntimeWarningrb   rM   errorUnicodeDecodeError
ValueErroritemsr   )refrawraw_mapsemsg
normalizedabvaluenorm_anorm_brD   rD   rH   load_confusion_datar   s,   

r{   )r,   r-   r.   r/   r0   r2   r1   r4   r3   r5   r   r   r   _CATEGORY_PREFERENCEenc_aenc_b
diff_bytesr^   c                 C   s   d}d}t | |@ }|sdS |D ])}|| \}	}
t|	d}t|
d}||kr/||| 7 }q||kr9||| 7 }q||kr@|S ||krF|S dS )a  Resolve between two encodings using Unicode category voting.

    For each distinguishing byte present in the data, compare the Unicode
    general category under each encoding. The encoding whose interpretation
    has the higher category preference score gets a vote. The encoding with
    more votes wins.

    :param data: The raw byte data to examine.
    :param enc_a: First encoding name.
    :param enc_b: Second encoding name.
    :param diff_bytes: Byte values where the two encodings differ.
    :param categories: Mapping of byte value to ``(cat_a, cat_b)`` Unicode
        general category pairs.
    :returns: The winning encoding name, or ``None`` if tied.
    r   N)rS   r|   rR   )rK   r}   r~   r   r^   votes_avotes_brelevantr_   cat_acat_bpref_apref_brD   rD   rH   resolve_by_category_voting   s&   r   profileindexencc                    s(   | |}|s	dS t fdd|D S )zDReturn the best bigram score across all language variants for *enc*.g        c                 3   s"    | ]\}}}t  ||V  qd S )N)r   )rE   rW   model	model_keyr   rD   rH   	<genexpr>   s
    

z&_best_variant_score.<locals>.<genexpr>)rR   max)r   r   r   variantsrD   r   rH   _best_variant_score   s   
r   c                 C   s   t | dk rdS t }i }tt | d D ]'}| | }| |d  }||vr*||vr*q|d> |B }	||	d||	  ||	< q|sAdS t|}
t }t|
||}t|
||}||kr[|S ||kra|S dS )a  Resolve between two encodings by re-scoring only distinguishing bigrams.

    Builds a focused bigram profile containing only bigrams where at least one
    byte is a distinguishing byte, then scores both encodings against their
    best language model.

    :param data: The raw byte data to examine.
    :param enc_a: First encoding name.
    :param enc_b: Second encoding name.
    :param diff_bytes: Byte values where the two encodings differ.
    :returns: The winning encoding name, or ``None`` if tied.
    r   Nr	   r   r   )lenr   rO   rR   r   from_weighted_freqr   r   )rK   r}   r~   r   idffreqib1b2idxr   r   best_abest_brD   rD   rH   resolve_by_bigram_rescore   s,   
r   mapsc                 C   s,   ||f| v r
||fS ||f| v r||fS dS )zEFind the canonical key for a pair of encodings in the confusion maps.NrD   )r   r}   r~   rD   rD   rH   _find_pair_key  s
   r   g{Gzt?resultsc                    s"  t |dk r|S |d }|jdu r|S t }|j}tdt |D ]n |  }|jdu r,q  dkr:||j tkr: |S t||j|j}|du rGq || \}}|\}	}
t| |	|
||}t| |	|
|}|durf|n|}|dur||jkrt	|j|j|j
|j} fddt|D }|g|  S q |S )a  Resolve confusion between similar encodings in the top results.

    Checks the top result against each candidate within a confidence band.
    Always checks position 1 (preserving original top-2 behavior); for
    positions 2+ only checks within the band.  Uses bigram re-scoring
    with category voting as fallback.

    :param data: The raw byte data to examine.
    :param results: Detection results sorted by confidence descending.
    :returns: A reordered list of :class:`DetectionResult` with the winner first.
    r   r   Nr	   c                    s   g | ]
\}}| kr|qS rD   rD   )rE   jrr   rD   rH   
<listcomp>f  s    z,resolve_confusion_groups.<locals>.<listcomp>)r   encodingr{   
confidencerO   _CONFUSION_BANDr   r   r   r   language	mime_type	enumerate)rK   r   topr   top_conf	candidatepair_keyr   r^   r}   r~   
cat_winnerbigram_winnerwinnerpromotedrestrD   r   rH   resolve_confusion_groups.  sF   


r   )'__doc__	functoolsimportlib.resourcesrd   rM   ri   chardet.modelsr   r   r   r   chardet.pipeliner   chardet.registryr   dicttuplestrrS   intDistinguishingMapsrC   __annotations__ro   rJ   bytesrb   cacher{   r|   r   list
memoryviewfloatr   r   r   r   r   rD   rD   rD   rH   <module>   sJ   
	
""(	
"
*

/
