o
    jv                     @   s  U d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ededddZ/dZ0dZ1e2h dZ3e2e4 e5d< e2h dZ6e2e7 e5d< e2h dZ8e2e7 e5d< e2h dZ9e2e7 e5d< e2h d Z:e2e7 e5d!< e6e8e9e:d"Z;e<e4e2e7 f e5d#< e2h d$Z=e2e7 e5d%< d&d'd(Z>e<e4e4f e5d)< d*e?d+ed,e2e4 d-efd.d/Z@d0e4d,e2e4 d1e4d-eAe fd2d3ZBd0e4d*e?d-eCfd4d5ZDd6ZEd7ZFd8ZGd9ZHd:ZId*e?d;eJe-d<f d=ed-eJe-d<f fd>d?ZKd*e?d@eAeJe4eLf  d;eJe-d<f d=ed-eAe f
dAdBZMd*e?dCeAe d-eAe fdDdEZNd*e?dCeAe d-eAe fdFdGZOdHZPd*e?d0e4d-e?dB fdIdJZQd*e?dCeAe d-eAe fdKdLZRd*e?dCeAe d-eAe fdMdNZSefdddOdPdQd*e?dRedSe7dTe2e4 dB dUe2e4 dB dVe4dWe4d-eAe fdXdYZTefdddOdPdQd*e?dRedSe7dTe2e4 dB dUe2e4 dB dVe4dWe4d-eAe fdZd[ZUdS )\u   Pipeline orchestrator — runs all detection stages in sequence.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)_NONE_RESULTDETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_magic)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)REGISTRYEncodingInfoget_candidatesapplication/octet-stream)encoding
confidencelanguage	mime_typeg333333?i @  >   	iso8859-1
iso8859-15cp1252_COMMON_LATIN_ENCODINGS>.                                                                                                                                             _ISO_8859_10_DISTINGUISHING>   r'   r(   r*   r+   r,   r-   r/   r0   r1   r3      r4   r5   r6   r7   r8      r:   r;   r<   r=   r>   r?   r@   rA      rI         rR      _ISO_8859_14_DISTINGUISHING>   rX      rY   rZ      r[   _WINDOWS_1254_DISTINGUISHING>   rB                     rC   rD      rE      rF            rG            rJ   r]   rY   _HP_ROMAN8_DISTINGUISHING)z
iso8859-10z
iso8859-14cp1254z	hp-roman8_DEMOTION_CANDIDATES>                           r'   r(   r+   r8   _KOI8_T_DISTINGUISHINGcp932cp949)shift_jis_2004euc_kr_MARKUP_SUPERSET_PROMOTIONSdatamarkup_resultallowedreturnc              	   C   s   |j du r|S t|j }|du s||vr|S t| }z	| j|dd W n ttfy1   | Y S w t }t| t|j  |}t| ||}||krRt	||j
|j|jS |S )aE  Promote a markup-declared encoding to its superset when structural evidence supports it.

    If the declared encoding has a known superset, the superset validates the
    data, and the superset's structural score is materially better, return a
    new result using the superset encoding.  Otherwise return the original.
    Nstricterrors)r   r~   getr   decodeUnicodeDecodeErrorLookupErrorr   r   r   r    r!   r"   )r   r   r   superset_namesuperset_infoctx
base_scoresuperset_score r   c/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/chardet/pipeline/orchestrator.py_try_promote_markup_superset   s,   
r   r   
param_namec                 C   s:   | |vrt j| d| dtdd tgS t| dddgS )zReturn a low-confidence result for *encoding*, or ``encoding=None`` if filtered out.

    ``stacklevel=5`` targets the public caller:
    detect() -> run_pipeline() -> _run_pipeline_core() -> _make_fallback_or_none().
     zL is excluded by include_encodings/exclude_encodings; returning encoding=None   )
stacklevelg?N)r   r    r!   )warningswarnUserWarningr   r   )r   r   r   r   r   r   _make_fallback_or_none  s   
r   c                    s.   t |   du rdS t fdd|D  S )au  Return True if encoding is a demotion candidate with no distinguishing bytes.

    Checks whether any non-ASCII byte in *data* falls in the set of byte
    values that decode differently under the given encoding vs iso-8859-1.
    If none do, the data is equally valid under both encodings and there is
    no byte-level evidence for preferring the candidate encoding.
    NFc                 3   s     | ]}|d kr| v V  qdS    Nr   .0bdistinguishingr   r   	<genexpr>7      z!_should_demote.<locals>.<genexpr>)rp   r   any)r   r   r   r   r   _should_demote,  s   
r   g?   gffffff?      valid_candidates.r   c                 C   s   g }|D ]W}|j rVt| ||}||j|j< |tk rq|jdu r,t| t| dt |_|jt	k r2qt
| |||jd}||j|j< |tk rFq|jtkrVt| ||}|tk rVq|| qt|S )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

    Four checks are applied in order to each multi-byte candidate:

    1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
       >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

    2. **Minimum non-ASCII byte count**: the data must contain at least
       ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
       can accidentally form perfect pairs and score 1.0 structurally.

    3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
       total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
       text has many high bytes that are NOT consumed by multi-byte pairs;
       genuine CJK text has nearly all high bytes accounted for.

    4. **Lead byte diversity**: the number of distinct lead byte values in
       valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
       draws from a wide repertoire of lead bytes; European false positives
       cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

    Returns the filtered candidate list.  Structural scores are cached in
    ``ctx.mb_scores`` for reuse in Stage 2b.
    N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr   len	translater
   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)r   r   r   gatedencmb_scorebyte_coveragelead_diversityr   r   r   _gate_cjk_candidatesX  s.   



r   structural_scoresc           
         s   dd |D  t  fdd|D }t dd |D }tt| dt g ||R }g }|D ]+}|jr<|j|jdnd}	|	dkrU|t|j|j	d	|	  |j
|j q/|| q/|jd
d dd |S )a  Score structurally-valid CJK candidates using statistical bigrams.

    When multiple CJK encodings score equally high structurally, statistical
    scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
    Single-byte candidates are also scored and included so that the caller
    can compare CJK vs single-byte confidence.

    Multi-byte candidates with high byte coverage (>= 0.95) receive a
    confidence boost proportional to coverage.  When nearly all non-ASCII
    bytes form valid multi-byte pairs, the structural evidence is strong
    and should increase the candidate's ranking relative to single-byte
    alternatives whose bigram models may score higher on small samples.

    Note: boosted confidence values may exceed 1.0 and are used only for
    relative ranking among candidates.  ``run_pipeline`` clamps all
    confidence values to [0.0, 1.0] before returning to callers.
    c                 S   s   i | ]	}|j r|j|qS r   )r   r   r   er   r   r   
<dictcomp>  s
    z0_score_structural_candidates.<locals>.<dictcomp>c                 3   s$    | ]\}}| v r | V  qd S Nr   )r   r   _sc
enc_lookupr   r   r     s    z/_score_structural_candidates.<locals>.<genexpr>c                 s   s    | ]}|j s|V  qd S r   )r   r   r   r   r   r     s    N        gffffff?   c                 S   s   | j S r   )r    xr   r   r   <lambda>  s    z._score_structural_candidates.<locals>.<lambda>Tkeyreverse)r   listr   _STAT_SCORE_MAX_BYTESr   r   r   r   r   r    r!   r"   sort)
r   r   r   r   valid_mbsingle_byteresultsboostedrcoverager   r   r   _score_structural_candidates  s,   r   r   c                    s   t |dkrS|d jdurSt|d j| rS|d j |d j}|dd D ]-jtv rRtj|jj} fdd|D } fdd|D }|g||  S q%|S )a  Demote niche Latin encodings when no distinguishing bytes are present.

    Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
    on data that contains only bytes shared with common Western Latin
    encodings.  When there is no byte-level evidence for the winning
    encoding, promote the first common Western Latin candidate to the top and
    push the demoted encoding to last.
    r   r   Nc                    s"   g | ]}|j  kr|ur|qS r   r   r   r   demoted_encodingr   r   r   
<listcomp>  s    z'_demote_niche_latin.<locals>.<listcomp>c                    s   g | ]	}|j  kr|qS r   r   r   )r   r   r   r     s    )r   r   r   r    r&   r   r!   r"   )r   r   top_confpromotedothersdemoted_entriesr   r   r   _demote_niche_latin  s$   


	r   c                    s   |r	|d j dkr|S tdd t|D d  du r|S tdd | D rI|  }|d j}t|j ||j|j} fddt|D }|g|S |S )	a  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

    KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
    making statistical discrimination difficult.  However, KOI8-T maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of these bytes appear, KOI8-T is the
    better match.
    r   zkoi8-rc                 s   s"    | ]\}}|j d kr|V  qdS )zkoi8-tNr   r   ir   r   r   r   r     s     z!_promote_koi8t.<locals>.<genexpr>Nc                 s   s     | ]}|d kr|t v V  qdS r   )ry   r   r   r   r   r     r   c                    s   g | ]
\}}| kr|qS r   r   r   	koi8t_idxr   r   r     s    z"_promote_koi8t.<locals>.<listcomp>)r   next	enumerater   r    r   r!   r"   )r   r   koi8t_resultr   r   r   r   r   r   _promote_koi8t  s"   

r   i   c              
   C   sB   |dkr| S z| j |ddjdddW S  tttfy    Y dS w )aP  Decode data from encoding and re-encode as UTF-8 for language scoring.

    Returns None if the encoding is unknown. For UTF-8, returns data as-is.
    Uses ``errors="ignore"`` because the data already passed byte-validity
    filtering for the detected encoding; any residual invalid bytes are
    irrelevant for language scoring.
    utf-8ignorer   surrogatepassN)r   encoder   	TypeError
ValueError)r   r   r   r   r   _to_utf8  s   r   c           
      C   s  g }d}d}|D ]}|j }|du r`|jdur`t|j}|du r8| r8t|jr8|du r.t| }t| |j|d\}}|du r`| r`tdr`t| |j}|r`|du sS|jdkrWt|}t|d|d\}}|j}	|	du rp|jdurndnd}	||j ksz|	|jkr|t	|j|j
||	 q|| q|S )a  Fill in language and mime_type for results missing them.

    **Language** (only for text results where ``encoding is not None``):

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).

    **MIME type**: text results default to ``"text/plain"``, binary results
    (``encoding is None``) default to ``"application/octet-stream"``.
    N)profiler   z
text/plainr   )r!   r   r   r   r   r   r   r"   r   r   r    )
r   r   filledr   utf8_profileresultlang_	utf8_datamimer   r   r   _fill_metadata  s>   


r   c                 C   s   t | |}t| |}t| |S )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )r   r   r   r   r   _postprocess_resultsO  s   


r   r%   r   include_encodingsexclude_encodingsno_match_encodingempty_input_encodingencoding_era	max_bytesr   r   r   r   c                C   sX  t  }| d| } t|||}tdd |D }	| s t||	dS t| }
|
dur0|
j|	v r0|
gS t| }|dur@|j|	v r@|gS t| }|durU|jdurU|j|	v rU|gS t| }|dur`|gS t	| }t
| }|du ry|du ryt| |drytgS t| }|dur|j|	v rt| ||	}|gS |dur|j|	v r|gS |dur|j|	v r|gS t| |}|st||	dS t| ||}|st||	dS g }|D ]"}|jr|j|j}|du rt| ||}|dkr||j|f q|r|jdd	 d
d |d \}}|tkrt| |||}|rt| |S | dt }tt|t|}|s't||	dS t| |S )zBCore pipeline logic. Returns list of results sorted by confidence.Nc                 s   s    | ]}|j V  qd S r   )r   )r   r   r   r   r   r   k  s    z%_run_pipeline_core.<locals>.<genexpr>r   )r   r   r   c                 S   s   | d S )Nr   r   r   r   r   r   r     s    z$_run_pipeline_core.<locals>.<lambda>Tr   r   )r   r   	frozensetr   r   r   r   r   r   r   r   r   _BINARY_RESULTr   r   r   r   r   r   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r   r   r   r   r   )r   r   r   r   r   r   r   r   
candidatesr   
bom_resultutf1632_resultescape_resultmagic_resultutf8_precheckascii_precheckr   r   r   r   scorer   
best_scorer   	stat_datar   r   r   _run_pipeline_coreY  s   






r  c          	   	   C   sF   t | ||||||d}t| dt |}|sd}t|dd |D S )aU  Run the full detection pipeline.

    :param data: The raw byte data to analyze.
    :param encoding_era: Filter candidates to a specific era of encodings.
    :param max_bytes: Maximum number of bytes to process.
    :param include_encodings: If not ``None``, only return these encodings.
    :param exclude_encodings: If not ``None``, never return these encodings.
    :param no_match_encoding: Encoding returned when no candidate survives.
    :param empty_input_encoding: Encoding returned for empty input.
    :returns: A list of :class:`DetectionResult` sorted by confidence descending.
    r   Nz/pipeline must always return at least one resultc                 S   s6   g | ]}|j d krt|jt|j d |j|jn|qS )g      ?)r    r   r   minr!   r"   )r   r   r   r   r   r     s    
z run_pipeline.<locals>.<listcomp>)r  r   _LANG_SCORE_MAX_BYTESRuntimeError)	r   r   r   r   r   r   r   r   msgr   r   r   run_pipeline  s    r  )V__doc__r   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r   chardet.pipeliner   r	   r
   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.magicr   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   r  r  r   r  r&   str__annotations__rU   intr\   r_   rn   rp   dictry   r~   bytesr   r   r   boolr   r   r   r   r   r   r   r   floatr   r   r   r  r   r   r   r  r  r   r   r   r   <module>   s\   7*

$



6

2
 
$
6


	
 

	