
    f j?0                     t   U d Z ddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
  ej        d          j        Z ej        d          j        ZdZi Zeeef         ed<    e	j                    D ]+Z eej                  dk    rej        d         eej        <   ,d	ed
eeeef         eeef         f         fdZej        d
eeeef         eeef         f         fd            Zd
eeef         fdZ deeef         d
eee!eedz  eef                  f         fdZ"ej        d
eee!eedz  eef                  f         fd            Z#ded
edz  fdZ$ded
e%fdZ&d
eeef         fdZ'ej        d
e(fd            Z) G d d          Z*	 dde*de(ez  ded
efdZ+	 dd	edede*dz  d
eeedz  f         fdZ,dS ) zModel loading and bigram scoring utilities.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)REGISTRYlookup_encodingz>Iz>ds   CMD2_SINGLE_LANG_MAP   datareturnc                    	 | dd         t           k    rd}t          |          d}t          | |          \  }|dz  }|dk    rd| d}t          |          g }i }t          |          D ]}t          | |          \  }|dz  }|dk    rd| d	}t          |          | |||z                                d
          }||z  }t          | |          \  }	|dz  }|                    |           |	||<   t          j        | |d                   }
|dz  }t          |
          |k    r$dt          |
           d| }t          |          t          |
          }i }t          |          D ]\  }}|dz  }|||dz            ||<   nZ# t          j        $ r}d| }t          |          |d}~wt          j        t          f$ r}d| }t          |          |d}~ww xY w||fS )zParse the v2 dense zlib-compressed models.bin format.

    :param data: Raw bytes of models.bin (must be non-empty).
    :returns: A ``(models, norms)`` tuple.
    :raises ValueError: If the data is corrupt or truncated.
    N   z&corrupt models.bin: missing CMD2 magici'  zcorrupt models.bin: num_models=z exceeds limit   zcorrupt models.bin: name_len=z exceeds 256zutf-8      z&corrupt models.bin: decompressed size z != expected zcorrupt models.bin: )	_V2_MAGIC
ValueError_unpack_uint32rangedecode_unpack_float64appendzlib
decompresslen
memoryview	enumerateerrorstructUnicodeDecodeError)r   msgoffset
num_modelsnamesnorms_name_lennamenormblobexpected_sizemvmodelsistartes                    kC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\venv\Lib\site-packages\chardet/models/__init__.py_parse_models_binr.      s   6%8y  	":CS//!&tV44! 	"NJNNNCS//!"$z"" 	 	A(v66KXaKF#~ &LhLLL oo%(!223::7CCDhF%dF33GTaKFLLE$KK tFGG}--"U*t99% 	"/T / /,/ /  S//! (* '' 	5 	5GAtIEeeem34F4LL	5 : % % %(Q((oo1$L,- % % %(Q((oo1$% 5=s$   F	F G#F00G#	GG#c                      t           j                            d                              d          } |                                 }|s t          j        dt          d           i i fS t          |          S )zkLoad and parse models.bin, returning (models, norms).

    Cached: only reads from disk on first call.
    chardet.modelsz
models.binuX   chardet models.bin is empty — statistical detection disabled; reinstall chardet to fix   
stacklevel)		importlib	resourcesfilesjoinpath
read_byteswarningswarnRuntimeWarningr.   refr   s     r-   _load_models_datar>   a   s}     

#
#$4
5
5
>
>|
L
LC>>D '		
 	
 	
 	
 2vT"""    c                  *    t                      d         S )zLoad all bigram models from the bundled models.bin file.

    Each model is a memoryview of length 65536 (256*256).
    Index: (b1 << 8) | b2 -> weight (0-255).

    :returns: A dict mapping model key strings to 65536-byte lookup tables.
    r   r>    r?   r-   load_modelsrC   v   s     q!!r?   r)   c                 ,   i }|                                  D ]J\  }}|                    dd          \  }}|                    |g                               |||f           Kt	          |          D ]"}t          |          }|||vr||         ||<   #|S )zBuild a grouped index from a models dict.

    :param models: Mapping of ``"lang/encoding"`` keys to 65536-byte tables.
    :returns: Mapping of encoding name to ``[(lang, model, model_key), ...]``.
    /r   )itemssplit
setdefaultr   listr   )r)   indexkeymodellangencenc_name	canonicals           r-   _build_enc_indexrQ      s     BDEllnn = =
UIIc1%%	cb!!(($s);<<<< KK / /#H--	 	/Ye%; 	/$XE)Lr?   c                  8    t          t                                S )zTReturn a pre-grouped index mapping encoding name -> [(lang, model, model_key), ...].)rQ   rC   rB   r?   r-   get_enc_indexrS      s     KMM***r?   encodingc                 6    t                               |           S )zReturn the language for a single-language encoding, or None.

    :param encoding: The canonical encoding name.
    :returns: An ISO 639-1 language code, or ``None`` if the encoding is
        multi-language.
    )r   getrT   s    r-   infer_languagerX      s     )))r?   c                 "    | t                      v S )zReturn True if the encoding has language variants in the model index.

    :param encoding: The canonical encoding name.
    :returns: ``True`` if bigram models exist for this encoding.
    )rS   rW   s    r-   has_model_variantsrZ      s     }&&r?   c                  *    t                      d         S )zAReturn cached L2 norms for all models, keyed by model key string.r   rA   rB   r?   r-   _get_model_normsr\      s    q!!r?   c                  P   t           j                            d                              d          } |                                 }t          |          dk    r?t          j        dt          |           dt          d           t          ddz            S t          |          S )	u  Return a 65536-byte IDF weight table for bigram profile construction.

    Loads a precomputed table from ``idf.bin`` (generated at training time).
    For each bigram index, the weight reflects how discriminative that bigram
    is across all models:

    - Bigrams in every model (common ASCII) → weight 1 (minimal signal)
    - Bigrams in one model → weight 255 (maximum signal)
    - Bigrams not in any model → weight 1 (unknown, treat as neutral)
    r0   zidf.binr   z chardet idf.bin has wrong size (z"), falling back to uniform weightsr1   r2      )
r4   r5   r6   r7   r8   r   r9   r:   r;   	bytearrayr<   s     r-   get_idf_weightsr`      s     

#
#$4
5
5
>
>y
I
IC>>D
4yyE *.s4yy . . .		
 	
 	
 	
 5)))T??r?   c                   V    e Zd ZdZdZdeddfdZedee	e	f         dd fd            Z
dS )	BigramProfileu  Pre-computed bigram frequency distribution for a data sample.

    Computing this once and reusing it across all models reduces per-model
    scoring from O(n) to O(distinct_bigrams).

    Stores a dense ``freq`` list of length 65536 indexed by bigram index, plus
    a ``nonzero`` list of indices with non-zero frequency for fast iteration.
    Each bigram is weighted by its IDF (inverse document frequency) across all
    models — bigrams unique to few models get high weight, bigrams common to
    all models get weight 1.
    )freq
input_normnonzero
weight_sumr   r   Nc                    t          |          dz
  }|dk    rg | _        g | _        d| _        d| _        dS t                      }dgdz  }g }d}t          |          D ]W}||         dz  ||dz            z  }||         }	||         dk    r|                    |           ||xx         |	z  cc<   ||	z  }X|| _        || _        || _        d}
|D ]}||         }|
||z  z  }
t          j	        |
          | _        dS )a?  Compute the bigram frequency distribution for *data*.

        Each bigram is weighted by its IDF (inverse document frequency) across
        all loaded models.  Bigrams unique to few models get high weight;
        bigrams common to all models get weight 1.

        :param data: The raw byte data to profile.
        r   r           Nr   r   )
r   rc   re   rf   rd   r`   r   r   mathsqrt)selfr   total_bigramsidfrc   re   w_sumr*   idxwnorm_sqvs               r-   __init__zBigramProfile.__init__   s6    D		AA 	 $&DI&(DL#$DO%(DOF#+}%% 	 	A7a<4A;.CCACyA~ $s###IIINIIIQJEE	 	 	CS	Aq1uGG)G,,r?   weighted_freqc                     | d          }dgdz  }g }|                                 D ]!\  }}|||<   |r|                    |           "||_        ||_        t	          |                                          |_        t          j        t	          d |                                D                                 |_	        |S )aL  Create a BigramProfile from pre-computed weighted frequencies.

        Computes ``weight_sum`` and ``input_norm`` from *weighted_freq* to
        ensure consistency between the stored fields.

        :param weighted_freq: Mapping of bigram index to weighted count.
        :returns: A new :class:`BigramProfile` instance.
        r?   r   r   c              3       K   | ]	}||z  V  
d S NrB   ).0rr   s     r-   	<genexpr>z3BigramProfile.from_weighted_freq.<locals>.<genexpr>  s&      *Q*QQ1q5*Q*Q*Q*Q*Q*Qr?   )
rF   r   rc   re   sumvaluesrf   ri   rj   rd   )clsrt   profilerc   re   ro   counts          r-   from_weighted_freqz BigramProfile.from_weighted_freq  s     #c((#+'--// 	$ 	$JCDI $s###! !5!5!7!788!Ys*Q*Q-:N:N:P:P*Q*Q*Q'Q'QRRr?   )__name__
__module____qualname____doc__	__slots__bytesrs   classmethoddictintr   rB   r?   r-   rb   rb      s        
 
 @I&-U &-t &- &- &- &-P tCH~ /    [  r?   rb    r}   rL   	model_keyc                 Z   | j         dk    rdS t                      }|r|                    |          nd}|:d}t          d          D ]}||         }|r|||z  z  }t	          j        |          }|dk    rdS d}| j        }	| j        D ]}
|||
         |	|
         z  z  }||| j         z  z  S )zSScore a pre-computed bigram profile against a single model using cosine similarity.rh   Nr   r   )rd   r\   rV   r   ri   rj   rc   re   )r}   rL   r   r!   
model_normsq_sumr*   rr   dotrc   ro   s              r-   score_with_profiler     s     S  sE)2<9%%%J 'u 	  	 AaA  !a%Yv&&
S s
C<D & &uSzDI%%*w1122r?   c                     | s|dS t                      }|                    |          }|dS |t          |           }d}d}|D ]!\  }}}	t          |||	          }
|
|k    r|
}|}"||fS )a  Score data against all language variants of an encoding.

    Returns (best_score, best_language). Uses a pre-grouped index for O(L)
    lookup where L is the number of language variants for the encoding.

    If *profile* is provided, it is reused instead of recomputing the bigram
    frequency distribution from *data*.

    :param data: The raw byte data to score.
    :param encoding: The canonical encoding name to match against.
    :param profile: Optional pre-computed :class:`BigramProfile` to reuse.
    :returns: A ``(score, language)`` tuple with the best cosine-similarity
        score and the corresponding language code (or ``None``).
    N)rh   Nrh   )rS   rV   rb   r   )r   rT   r}   rJ   variants
best_score	best_langrM   rL   r   ss              r-   score_best_languager   6  s    &  G yOOEyy""H y &%%J I"*  eYwy99z> 	JIy  r?   )r   rw   )-r   	functoolsimportlib.resourcesr4   ri   r   r9   r   chardet.registryr   r   Structunpack_fromr   r   r   r   r   str__annotations__r{   _encr   	languagesr$   r   tupler   floatr.   cacher>   rC   rI   rQ   rS   rX   boolrZ   r\   r_   r`   rb   r   r   rB   r?   r-   <module>r      s                 6 6 6 6 6 6 6 6t$$0&-%%1	 $& $sCx. % % %HO 8 8D
s4>a 8&*nQ&7#A
A
4Z $sEz"223A A A AH #5c:o!6S%Z8H!HI # # # #("T#z/* " " " "j!	#tE#*j#567
78   . +tCeC$J
C,G&H!IIJ + + + +
*S *S4Z * * * *' ' ' ' ' '"$sEz* " " " "
     0L L L L L L L L` MO3 33#,z#93FI3
3 3 3 36 %)&! &!
&!&! T!&! 5#*	&! &! &! &! &! &!r?   