"""Model loading and bigram scoring utilities.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

import functools
import importlib.resources
import math
import struct
import warnings
import zlib

from chardet.registry import REGISTRY, lookup_encoding

_unpack_uint32 = struct.Struct(">I").unpack_from
_unpack_float64 = struct.Struct(">d").unpack_from

_V2_MAGIC = b"CMD2"

# Encoding name -> language code, for encodings whose registry entry lists
# exactly one language.
_SINGLE_LANG_MAP: dict[str, str] = {}
for _enc in REGISTRY.values():
    if len(_enc.languages) == 1:
        _SINGLE_LANG_MAP[_enc.name] = _enc.languages[0]


def _parse_models_bin(
    data: bytes,
) -> tuple[dict[str, memoryview], dict[str, float]]:
    """Parse the v2 dense zlib-compressed models.bin format.

    :param data: Raw bytes of models.bin (must be non-empty).
    :returns: A ``(models, norms)`` tuple.
    :raises ValueError: If the data is corrupt or truncated.
    """
    try:
        if data[:4] != _V2_MAGIC:
            msg = "corrupt models.bin: missing CMD2 magic"
            raise ValueError(msg)
        offset = 4
        (num_models,) = _unpack_uint32(data, offset)
        offset += 4
        if num_models > 10000:
            msg = f"corrupt models.bin: num_models={num_models} exceeds limit"
            raise ValueError(msg)
        names = []
        norms = {}
        # Header: one (name length, UTF-8 name, float64 L2 norm) record per model.
        for _ in range(num_models):
            (name_len,) = _unpack_uint32(data, offset)
            offset += 4
            if name_len > 256:
                msg = f"corrupt models.bin: name_len={name_len} exceeds 256"
                raise ValueError(msg)
            name = data[offset : offset + name_len].decode("utf-8")
            offset += name_len
            (norm,) = _unpack_float64(data, offset)
            offset += 8
            names.append(name)
            norms[name] = norm
        # Payload: a single zlib stream of the concatenated 65536-byte tables.
        blob = zlib.decompress(data[offset:])
        expected_size = num_models * 65536
        if len(blob) != expected_size:
            msg = (
                f"corrupt models.bin: decompressed size {len(blob)} "
                f"!= expected {expected_size}"
            )
            raise ValueError(msg)
        mv = memoryview(blob)
        models = {}
        for i, name in enumerate(names):
            start = i * 65536
            models[name] = mv[start : start + 65536]
        return models, norms
    except zlib.error as e:
        msg = f"corrupt models.bin: {e}"
        raise ValueError(msg) from e
    except (struct.error, UnicodeDecodeError) as e:
        msg = f"corrupt models.bin: {e}"
        raise ValueError(msg) from e


@functools.cache
def _load_models_data() -> tuple[dict[str, memoryview], dict[str, float]]:
    """Load and parse models.bin, returning (models, norms).

    Cached: only reads from disk on first call.
    """
    ref = importlib.resources.files("chardet.models").joinpath("models.bin")
    data = ref.read_bytes()
    if not data:
        warnings.warn(
            "chardet models.bin is empty — statistical detection disabled; "
            "reinstall chardet to fix",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}, {}
    return _parse_models_bin(data)
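

# Illustrative sketch only (the helper name is hypothetical and nothing in the
# package calls it): it assembles a blob in the same v2 layout that
# ``_parse_models_bin`` reads back -- the CMD2 magic, a big-endian model count,
# one (name length, UTF-8 name, float64 L2 norm) record per model, and finally
# a single zlib stream of the concatenated 65536-byte tables.  Kept here as a
# reference for tests or for regenerating models.bin offline.
def _example_build_models_bin(
    tables: dict[str, bytes], norms: dict[str, float]
) -> bytes:
    header = bytearray(_V2_MAGIC)
    header += struct.pack(">I", len(tables))
    payload = bytearray()
    for name, table in tables.items():
        if len(table) != 65536:
            raise ValueError("each model table must be exactly 65536 bytes")
        encoded = name.encode("utf-8")
        header += struct.pack(">I", len(encoded))
        header += encoded
        header += struct.pack(">d", norms[name])
        payload += table
    return bytes(header) + zlib.compress(bytes(payload))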


def load_models() -> dict[str, memoryview]:
    """Load all bigram models from the bundled models.bin file.

    Each model is a memoryview of length 65536 (256*256).
    Index: (b1 << 8) | b2 -> weight (0-255).

    :returns: A dict mapping model key strings to 65536-byte lookup tables.
    """
    return _load_models_data()[0]
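

# Minimal usage sketch (hypothetical helper, not part of the public API):
# shows how a single bigram weight is read out of a loaded 65536-byte table
# using the ``(b1 << 8) | b2`` indexing described above.
def _example_lookup_bigram_weight(model_key: str, b1: int, b2: int) -> int:
    """Return the 0-255 weight that *model_key*'s table assigns to (b1, b2)."""
    table = load_models()[model_key]
    return table[(b1 << 8) | b2]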


def _build_enc_index(
    models: dict[str, memoryview],
) -> dict[str, list[tuple[str | None, memoryview, str]]]:
    """Build a grouped index from a models dict.

    :param models: Mapping of ``"lang/encoding"`` keys to 65536-byte tables.
    :returns: Mapping of encoding name to ``[(lang, model, model_key), ...]``.
    """
    index: dict[str, list[tuple[str | None, memoryview, str]]] = {}
    for key, model in models.items():
        lang, enc = key.split("/", 1)
        index.setdefault(enc, []).append((lang, model, key))
    # Also expose each group under its canonical encoding name, if different.
    for enc_name in list(index):
        canonical = lookup_encoding(enc_name)
        if canonical is not None and canonical not in index:
            index[canonical] = index[enc_name]
    return index


@functools.cache
def get_enc_index() -> dict[str, list[tuple[str | None, memoryview, str]]]:
    """Return a pre-grouped index mapping encoding name -> [(lang, model, model_key), ...]."""
    return _build_enc_index(load_models())


def infer_language(encoding: str) -> str | None:
    """Return the language for a single-language encoding, or None.

    :param encoding: The canonical encoding name.
    :returns: An ISO 639-1 language code, or ``None`` if the encoding is
        multi-language.
    """
    return _SINGLE_LANG_MAP.get(encoding)


def has_model_variants(encoding: str) -> bool:
    """Return True if the encoding has language variants in the model index.

    :param encoding: The canonical encoding name.
    :returns: ``True`` if bigram models exist for this encoding.
    """
    return encoding in get_enc_index()


def _get_model_norms() -> dict[str, float]:
    """Return cached L2 norms for all models, keyed by model key string."""
    return _load_models_data()[1]


@functools.cache
def get_idf_weights() -> bytearray:
    """Return a 65536-byte IDF weight table for bigram profile construction.

    Loads a precomputed table from ``idf.bin`` (generated at training time).
    For each bigram index, the weight reflects how discriminative that bigram
    is across all models:

    - Bigrams in every model (common ASCII) → weight 1 (minimal signal)
    - Bigrams in one model → weight 255 (maximum signal)
    - Bigrams not in any model → weight 1 (unknown, treat as neutral)
    """
    ref = importlib.resources.files("chardet.models").joinpath("idf.bin")
    data = ref.read_bytes()
    if len(data) != 65536:
        warnings.warn(
            f"chardet idf.bin has wrong size ({len(data)}), "
            "falling back to uniform weights",
            RuntimeWarning,
            stacklevel=2,
        )
        return bytearray(b"\x01" * 65536)
    return bytearray(data)
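

# Illustrative sketch only: the shipped ``idf.bin`` is produced offline at
# training time, and the exact weighting formula used there is not part of
# this module.  The hypothetical helper below shows one way such a table
# could be derived from the loaded models so that the endpoints match the
# description above -- a bigram present in a single model gets 255, one
# present in every model gets 1, and an unseen bigram gets the neutral 1.
def _example_compute_idf_table(models: dict[str, memoryview]) -> bytearray:
    table = bytearray(65536)
    num_models = len(models)
    for idx in range(65536):
        # Document frequency: how many models give this bigram a non-zero weight.
        doc_freq = sum(1 for m in models.values() if m[idx])
        if doc_freq == 0 or num_models < 2:
            table[idx] = 1
        else:
            # Linear ramp from 255 (unique to one model) down to 1 (in all models).
            table[idx] = 1 + round(254 * (num_models - doc_freq) / (num_models - 1))
    return table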


class BigramProfile:
    """Pre-computed bigram frequency distribution for a data sample.

    Computing this once and reusing it across all models reduces per-model
    scoring from O(n) to O(distinct_bigrams).

    Stores a dense ``freq`` list of length 65536 indexed by bigram index, plus
    a ``nonzero`` list of indices with non-zero frequency for fast iteration.
    Each bigram is weighted by its IDF (inverse document frequency) across all
    models — bigrams unique to few models get high weight, bigrams common to
    all models get weight 1.
    """

    __slots__ = ("freq", "input_norm", "nonzero", "weight_sum")

    def __init__(self, data: bytes) -> None:
        """Compute the bigram frequency distribution for *data*.

        Each bigram is weighted by its IDF (inverse document frequency) across
        all loaded models.  Bigrams unique to few models get high weight;
        bigrams common to all models get weight 1.

        :param data: The raw byte data to profile.
        """
        total_bigrams = len(data) - 1
        if total_bigrams <= 0:
            self.freq = []
            self.nonzero = []
            self.weight_sum = 0.0
            self.input_norm = 0.0
            return
        idf = get_idf_weights()
        freq = [0] * 65536
        nonzero = []
        w_sum = 0
        for i in range(total_bigrams):
            idx = (data[i] << 8) | data[i + 1]
            w = idf[idx]
            if freq[idx] == 0:
                nonzero.append(idx)
            freq[idx] += w
            w_sum += w
        self.freq = freq
        self.nonzero = nonzero
        self.weight_sum = w_sum
        norm_sq = 0
        for idx in nonzero:
            v = freq[idx]
            norm_sq += v * v
        self.input_norm = math.sqrt(norm_sq)

    @classmethod
    def from_weighted_freq(cls, weighted_freq: dict[int, int]) -> "BigramProfile":
        """Create a BigramProfile from pre-computed weighted frequencies.

        Computes ``weight_sum`` and ``input_norm`` from *weighted_freq* to
        ensure consistency between the stored fields.

        :param weighted_freq: Mapping of bigram index to weighted count.
        :returns: A new :class:`BigramProfile` instance.
        """
        profile = cls(b"")
        freq = [0] * 65536
        nonzero = []
        for idx, count in weighted_freq.items():
            freq[idx] = count
            if count:
                nonzero.append(idx)
        profile.freq = freq
        profile.nonzero = nonzero
        profile.weight_sum = sum(weighted_freq.values())
        profile.input_norm = math.sqrt(sum(v * v for v in weighted_freq.values()))
        return profile
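

# Usage sketch (hypothetical helper, not part of the public API) built on the
# scoring functions defined below: the intended pattern is to build one
# ``BigramProfile`` per input and reuse it across every candidate encoding,
# so the per-model cost stays proportional to the number of distinct bigrams
# rather than the input length.
def _example_rank_encodings(
    data: bytes, candidates: list[str]
) -> list[tuple[str, float, str | None]]:
    profile = BigramProfile(data)
    results = []
    for encoding in candidates:
        if not has_model_variants(encoding):
            continue
        score, language = score_best_language(data, encoding, profile)
        results.append((encoding, score, language))
    # Highest cosine similarity first.
    results.sort(key=lambda item: item[1], reverse=True)
    return results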


def score_with_profile(
    profile: BigramProfile, model: memoryview | bytes, model_key: str = ""
) -> float:
    """Score a pre-computed bigram profile against a single model using cosine similarity."""
    if profile.input_norm == 0.0:
        return 0.0
    norms = _get_model_norms()
    model_norm = norms.get(model_key) if model_key else None
    if model_norm is None:
        # No precomputed norm for this key: derive it from the table itself.
        sq_sum = 0
        for i in range(65536):
            w = model[i]
            if w:
                sq_sum += w * w
        model_norm = math.sqrt(sq_sum)
    if model_norm == 0:
        return 0.0
    dot = 0
    freq = profile.freq
    for idx in profile.nonzero:
        dot += model[idx] * freq[idx]
    return dot / (model_norm * profile.input_norm)


def score_best_language(
    data: bytes, encoding: str, profile: BigramProfile | None = None
) -> tuple[float, str | None]:
    """Score data against all language variants of an encoding.

    Returns (best_score, best_language). Uses a pre-grouped index for O(L)
    lookup where L is the number of language variants for the encoding.

    If *profile* is provided, it is reused instead of recomputing the bigram
    frequency distribution from *data*.

    :param data: The raw byte data to score.
    :param encoding: The canonical encoding name to match against.
    :param profile: Optional pre-computed :class:`BigramProfile` to reuse.
    :returns: A ``(score, language)`` tuple with the best cosine-similarity
        score and the corresponding language code (or ``None``).
    """
    if not data and profile is None:
        return 0.0, None
    index = get_enc_index()
    variants = index.get(encoding)
    if variants is None:
        return 0.0, None
    if profile is None:
        profile = BigramProfile(data)
    best_score = 0.0
    best_lang = None
    for lang, model, model_key in variants:
        s = score_with_profile(profile, model, model_key)
        if s > best_score:
            best_score = s
            best_lang = lang
    return best_score, best_lang