"""Model loading and bigram scoring utilities.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

import functools
import importlib.resources
import math
import struct
import warnings
import zlib

from chardet.registry import REGISTRY, lookup_encoding

_unpack_uint32 = struct.Struct(">I").unpack_from
_unpack_float64 = struct.Struct(">d").unpack_from

_V2_MAGIC = b"CMD2"

# Encoding name -> language code, for encodings whose registry entry lists
# exactly one language.
_SINGLE_LANG_MAP: dict[str, str] = {}
for _enc in REGISTRY.values():
    if len(_enc.languages) == 1:
        _SINGLE_LANG_MAP[_enc.name] = _enc.languages[0]


def _parse_models_bin(
    data: bytes,
) -> tuple[dict[str, memoryview], dict[str, float]]:
    """Parse the v2 dense zlib-compressed models.bin format.

    :param data: Raw bytes of models.bin (must be non-empty).
    :returns: A ``(models, norms)`` tuple.
    :raises ValueError: If the data is corrupt or truncated.
    """
    try:
        if data[:4] != _V2_MAGIC:
            msg = "corrupt models.bin: missing CMD2 magic"
            raise ValueError(msg)
        offset = 4
        (num_models,) = _unpack_uint32(data, offset)
        offset += 4
        if num_models > 10000:
            msg = f"corrupt models.bin: num_models={num_models} exceeds limit"
            raise ValueError(msg)
        names = []
        norms = {}
        # Header: one (name length, UTF-8 name, float64 L2 norm) record per model.
        for _ in range(num_models):
            (name_len,) = _unpack_uint32(data, offset)
            offset += 4
            if name_len > 256:
                msg = f"corrupt models.bin: name_len={name_len} exceeds 256"
                raise ValueError(msg)
            name = data[offset : offset + name_len].decode("utf-8")
            offset += name_len
            (norm,) = _unpack_float64(data, offset)
            offset += 8
            names.append(name)
            norms[name] = norm
        # Payload: a single zlib stream of the concatenated 65536-byte tables.
        blob = zlib.decompress(data[offset:])
        expected_size = num_models * 65536
        if len(blob) != expected_size:
            msg = (
                f"corrupt models.bin: decompressed size {len(blob)} "
                f"!= expected {expected_size}"
            )
            raise ValueError(msg)
        mv = memoryview(blob)
        models = {}
        for i, name in enumerate(names):
            start = i * 65536
            models[name] = mv[start : start + 65536]
        return models, norms
    except zlib.error as e:
        msg = f"corrupt models.bin: {e}"
        raise ValueError(msg) from e
    except (struct.error, UnicodeDecodeError) as e:
        msg = f"corrupt models.bin: {e}"
        raise ValueError(msg) from e


@functools.cache
def _load_models_data() -> tuple[dict[str, memoryview], dict[str, float]]:
    """Load and parse models.bin, returning (models, norms).

    Cached: only reads from disk on first call.
    """
    ref = importlib.resources.files("chardet.models").joinpath("models.bin")
    data = ref.read_bytes()
    if not data:
        warnings.warn(
            "chardet models.bin is empty — statistical detection disabled; "
            "reinstall chardet to fix",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}, {}
    return _parse_models_bin(data)
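

# Illustrative sketch only (the helper name is hypothetical and nothing in the
# package calls it): it assembles a blob in the same v2 layout that
# ``_parse_models_bin`` reads back -- the CMD2 magic, a big-endian model count,
# one (name length, UTF-8 name, float64 L2 norm) record per model, and finally
# a single zlib stream of the concatenated 65536-byte tables.  Kept here as a
# reference for tests or for regenerating models.bin offline.
def _example_build_models_bin(
    tables: dict[str, bytes], norms: dict[str, float]
) -> bytes:
    header = bytearray(_V2_MAGIC)
    header += struct.pack(">I", len(tables))
    payload = bytearray()
    for name, table in tables.items():
        if len(table) != 65536:
            raise ValueError("each model table must be exactly 65536 bytes")
        encoded = name.encode("utf-8")
        header += struct.pack(">I", len(encoded))
        header += encoded
        header += struct.pack(">d", norms[name])
        payload += table
    return bytes(header) + zlib.compress(bytes(payload))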


def load_models() -> dict[str, memoryview]:
    """Load all bigram models from the bundled models.bin file.

    Each model is a memoryview of length 65536 (256*256).
    Index: (b1 << 8) | b2 -> weight (0-255).

    :returns: A dict mapping model key strings to 65536-byte lookup tables.
    """
    return _load_models_data()[0]
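

# Minimal usage sketch (hypothetical helper, not part of the public API):
# shows how a single bigram weight is read out of a loaded 65536-byte table
# using the ``(b1 << 8) | b2`` indexing described above.
def _example_lookup_bigram_weight(model_key: str, b1: int, b2: int) -> int:
    """Return the 0-255 weight that *model_key*'s table assigns to (b1, b2)."""
    table = load_models()[model_key]
    return table[(b1 << 8) | b2]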


def _build_enc_index(
    models: dict[str, memoryview],
) -> dict[str, list[tuple[str | None, memoryview, str]]]:
    """Build a grouped index from a models dict.

    :param models: Mapping of ``"lang/encoding"`` keys to 65536-byte tables.
    :returns: Mapping of encoding name to ``[(lang, model, model_key), ...]``.
    """
    index: dict[str, list[tuple[str | None, memoryview, str]]] = {}
    for key, model in models.items():
        lang, enc = key.split("/", 1)
        index.setdefault(enc, []).append((lang, model, key))
    # Also expose each group under its canonical encoding name, if different.
    for enc_name in list(index):
        canonical = lookup_encoding(enc_name)
        if canonical is not None and canonical not in index:
            index[canonical] = index[enc_name]
    return index


@functools.cache
def get_enc_index() -> dict[str, list[tuple[str | None, memoryview, str]]]:
    """Return a pre-grouped index mapping encoding name -> [(lang, model, model_key), ...]."""
    return _build_enc_index(load_models())


def infer_language(encoding: str) -> str | None:
    """Return the language for a single-language encoding, or None.

    :param encoding: The canonical encoding name.
    :returns: An ISO 639-1 language code, or ``None`` if the encoding is
        multi-language.
    """
    return _SINGLE_LANG_MAP.get(encoding)


def has_model_variants(encoding: str) -> bool:
    """Return True if the encoding has language variants in the model index.

    :param encoding: The canonical encoding name.
    :returns: ``True`` if bigram models exist for this encoding.
    """
    return encoding in get_enc_index()


def _get_model_norms() -> dict[str, float]:
    """Return cached L2 norms for all models, keyed by model key string."""
    return _load_models_data()[1]


@functools.cache
def get_idf_weights() -> bytearray:
    """Return a 65536-byte IDF weight table for bigram profile construction.

    Loads a precomputed table from ``idf.bin`` (generated at training time).
    For each bigram index, the weight reflects how discriminative that bigram
    is across all models:

    - Bigrams in every model (common ASCII) → weight 1 (minimal signal)
    - Bigrams in one model → weight 255 (maximum signal)
    - Bigrams not in any model → weight 1 (unknown, treat as neutral)
    """
    ref = importlib.resources.files("chardet.models").joinpath("idf.bin")
    data = ref.read_bytes()
    if len(data) != 65536:
        warnings.warn(
            f"chardet idf.bin has wrong size ({len(data)}), "
            "falling back to uniform weights",
            RuntimeWarning,
            stacklevel=2,
        )
        return bytearray(b"\x01" * 65536)
    return bytearray(data)
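

# Illustrative sketch only: the shipped ``idf.bin`` is produced offline at
# training time, and the exact weighting formula used there is not part of
# this module.  The hypothetical helper below shows one way such a table
# could be derived from the loaded models so that the endpoints match the
# description above -- a bigram present in a single model gets 255, one
# present in every model gets 1, and an unseen bigram gets the neutral 1.
def _example_compute_idf_table(models: dict[str, memoryview]) -> bytearray:
    table = bytearray(65536)
    num_models = len(models)
    for idx in range(65536):
        # Document frequency: how many models give this bigram a non-zero weight.
        doc_freq = sum(1 for m in models.values() if m[idx])
        if doc_freq == 0 or num_models < 2:
            table[idx] = 1
        else:
            # Linear ramp from 255 (unique to one model) down to 1 (in all models).
            table[idx] = 1 + round(254 * (num_models - doc_freq) / (num_models - 1))
    return table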


class BigramProfile:
    """Pre-computed bigram frequency distribution for a data sample.

    Computing this once and reusing it across all models reduces per-model
    scoring from O(n) to O(distinct_bigrams).

    Stores a dense ``freq`` list of length 65536 indexed by bigram index, plus
    a ``nonzero`` list of indices with non-zero frequency for fast iteration.
    Each bigram is weighted by its IDF (inverse document frequency) across all
    models — bigrams unique to few models get high weight, bigrams common to
    all models get weight 1.
    """

    __slots__ = ("freq", "input_norm", "nonzero", "weight_sum")

    def __init__(self, data: bytes) -> None:
        """Compute the bigram frequency distribution for *data*.

        Each bigram is weighted by its IDF (inverse document frequency) across
        all loaded models.  Bigrams unique to few models get high weight;
        bigrams common to all models get weight 1.

        :param data: The raw byte data to profile.
        """
        total_bigrams = len(data) - 1
        if total_bigrams <= 0:
            self.freq = []
            self.nonzero = []
            self.weight_sum = 0.0
            self.input_norm = 0.0
            return
        idf = get_idf_weights()
        freq = [0] * 65536
        nonzero = []
        w_sum = 0
        for i in range(total_bigrams):
            idx = (data[i] << 8) | data[i + 1]
            w = idf[idx]
            if freq[idx] == 0:
                nonzero.append(idx)
            freq[idx] += w
            w_sum += w
        self.freq = freq
        self.nonzero = nonzero
        self.weight_sum = w_sum
        norm_sq = 0
        for idx in nonzero:
            v = freq[idx]
            norm_sq += v * v
        self.input_norm = math.sqrt(norm_sq)

    @classmethod
    def from_weighted_freq(cls, weighted_freq: dict[int, int]) -> "BigramProfile":
        """Create a BigramProfile from pre-computed weighted frequencies.

        Computes ``weight_sum`` and ``input_norm`` from *weighted_freq* to
        ensure consistency between the stored fields.

        :param weighted_freq: Mapping of bigram index to weighted count.
        :returns: A new :class:`BigramProfile` instance.
        """
        profile = cls(b"")
        freq = [0] * 65536
        nonzero = []
        for idx, count in weighted_freq.items():
            freq[idx] = count
            if count:
                nonzero.append(idx)
        profile.freq = freq
        profile.nonzero = nonzero
        profile.weight_sum = sum(weighted_freq.values())
        profile.input_norm = math.sqrt(sum(v * v for v in weighted_freq.values()))
        return profile
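

# Usage sketch (hypothetical helper, not part of the public API) built on the
# scoring functions defined below: the intended pattern is to build one
# ``BigramProfile`` per input and reuse it across every candidate encoding,
# so the per-model cost stays proportional to the number of distinct bigrams
# rather than the input length.
def _example_rank_encodings(
    data: bytes, candidates: list[str]
) -> list[tuple[str, float, str | None]]:
    profile = BigramProfile(data)
    results = []
    for encoding in candidates:
        if not has_model_variants(encoding):
            continue
        score, language = score_best_language(data, encoding, profile)
        results.append((encoding, score, language))
    # Highest cosine similarity first.
    results.sort(key=lambda item: item[1], reverse=True)
    return results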


def score_with_profile(
    profile: BigramProfile, model: memoryview | bytes, model_key: str = ""
) -> float:
    """Score a pre-computed bigram profile against a single model using cosine similarity."""
    if profile.input_norm == 0.0:
        return 0.0
    norms = _get_model_norms()
    model_norm = norms.get(model_key) if model_key else None
    if model_norm is None:
        # No precomputed norm for this key: derive it from the table itself.
        sq_sum = 0
        for i in range(65536):
            w = model[i]
            if w:
                sq_sum += w * w
        model_norm = math.sqrt(sq_sum)
    if model_norm == 0:
        return 0.0
    dot = 0
    freq = profile.freq
    for idx in profile.nonzero:
        dot += model[idx] * freq[idx]
    return dot / (model_norm * profile.input_norm)


def score_best_language(
    data: bytes, encoding: str, profile: BigramProfile | None = None
) -> tuple[float, str | None]:
    """Score data against all language variants of an encoding.

    Returns (best_score, best_language). Uses a pre-grouped index for O(L)
    lookup where L is the number of language variants for the encoding.

    If *profile* is provided, it is reused instead of recomputing the bigram
    frequency distribution from *data*.

    :param data: The raw byte data to score.
    :param encoding: The canonical encoding name to match against.
    :param profile: Optional pre-computed :class:`BigramProfile` to reuse.
    :returns: A ``(score, language)`` tuple with the best cosine-similarity
        score and the corresponding language code (or ``None``).
    """
    if not data and profile is None:
        return 0.0, None
    index = get_enc_index()
    variants = index.get(encoding)
    if variants is None:
        return 0.0, None
    if profile is None:
        profile = BigramProfile(data)
    best_score = 0.0
    best_lang = None
    for lang, model, model_key in variants:
        s = score_with_profile(profile, model, model_key)
        if s > best_score:
            best_score = s
            best_lang = lang
    return best_score, best_lang