
    f j&y                     P   U d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.  ededd          Z/dZ0dZ1 e2h d          Z3e2e4         e5d<    e2h d          Z6e2e7         e5d<    e2h d          Z8e2e7         e5d<    e2h d          Z9e2e7         e5d<    e2h d           Z:e2e7         e5d!<   e6e8e9e:d"Z;e<e4e2e7         f         e5d#<    e2h d$          Z=e2e7         e5d%<   d&d'd(Z>e<e4e4f         e5d)<   d*e?d+ed,e2e4         d-efd.Z@d/e4d,e2e4         d0e4d-eAe         fd1ZBd/e4d*e?d-eCfd2ZDd3ZEd4ZFd5ZGd6ZHd7ZId*e?d8eJe-d9f         d:ed-eJe-d9f         fd;ZKd*e?d<eAeJe4eLf                  d8eJe-d9f         d:ed-eAe         f
d=ZMd*e?d>eAe         d-eAe         fd?ZNd*e?d>eAe         d-eAe         fd@ZOdAZPd*e?d/e4d-e?dz  fdBZQd*e?d>eAe         d-eAe         fdCZRd*e?d>eAe         d-eAe         fdDZSefdddEdFdGd*e?dHedIe7dJe2e4         dz  dKe2e4         dz  dLe4dMe4d-eAe         fdNZTefdddEdFdGd*e?dHedIe7dJe2e4         dz  dKe2e4         dz  dLe4dMe4d-eAe         fdOZUdS )Pu   Pipeline orchestrator — runs all detection stages in sequence.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)_NONE_RESULTDETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_magic)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)REGISTRYEncodingInfoget_candidatesapplication/octet-stream)encoding
confidencelanguage	mime_typeg333333?i @  >   	iso8859-1
iso8859-15cp1252_COMMON_LATIN_ENCODINGS>.                                                                                                                                             _ISO_8859_10_DISTINGUISHING>   r(   r)   r+   r,   r-   r.   r0   r1   r2   r4      r5   r6   r7   r8   r9      r;   r<   r=   r>   r?   r@   rA   rB      rJ         rS      _ISO_8859_14_DISTINGUISHING>   rY      rZ   r[      r\   _WINDOWS_1254_DISTINGUISHING>   rC                     rD   rE      rF      rG            rH            rK   r^   rZ   _HP_ROMAN8_DISTINGUISHING)z
iso8859-10z
iso8859-14cp1254z	hp-roman8_DEMOTION_CANDIDATES>                           r(   r)   r,   r9   _KOI8_T_DISTINGUISHINGcp932cp949)shift_jis_2004euc_kr_MARKUP_SUPERSET_PROMOTIONSdatamarkup_resultallowedreturnc                    |j         |S t                              |j                   }|||vr|S t          |         }	 |                     |d           n# t
          t          f$ r |cY S w xY wt                      }t          | t          |j                  |          }t          | ||          }||k    r!t          ||j
        |j        |j                  S |S )aE  Promote a markup-declared encoding to its superset when structural evidence supports it.

    If the declared encoding has a known superset, the superset validates the
    data, and the superset's structural score is materially better, return a
    new result using the superset encoding.  Otherwise return the original.
    Nstricterrors)r    r   getr   decodeUnicodeDecodeErrorLookupErrorr   r   r   r!   r"   r#   )r   r   r   superset_namesuperset_infoctx
base_scoresuperset_scores           qC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\venv\Lib\site-packages\chardet/pipeline/orchestrator.py_try_promote_markup_supersetr      s     /33M4JKKM W < ]+MM(3333,    

C)$9O0PRUVVJ-dM3GGN
" 
$"#	
 
 	
 s   A A-,A-r    
param_namec                     | |vr*t          j        | d| dt          d           t          gS t	          | dd          gS )zReturn a low-confidence result for *encoding*, or ``encoding=None`` if filtered out.

    ``stacklevel=5`` targets the public caller:
    detect() -> run_pipeline() -> _run_pipeline_core() -> _make_fallback_or_none().
     zL is excluded by include_encodings/exclude_encodings; returning encoding=None   )
stacklevelg?N)r    r!   r"   )warningswarnUserWarningr	   r   )r    r   r   s      r   _make_fallback_or_noner     su     w  L LH L L L		
 	
 	
 	
 ~X$NNNOO    c                 x    t                               |           dS t          fd|D                        S )au  Return True if encoding is a demotion candidate with no distinguishing bytes.

    Checks whether any non-ASCII byte in *data* falls in the set of byte
    values that decode differently under the given encoding vs iso-8859-1.
    If none do, the data is equally valid under both encodings and there is
    no byte-level evidence for preferring the candidate encoding.
    NFc              3   ,   K   | ]}|d k    |v V  dS    N ).0bdistinguishings     r   	<genexpr>z!_should_demote.<locals>.<genexpr>7  s1      AA1DA1&AAAAAAr   )rq   r   any)r    r   r   s     @r   _should_demoter   ,  sK     *--h77N uAAAAAAAAAAAr   g?   gffffff?      valid_candidates.r   c                 ,   g }|D ] }|j         rt          | ||          }||j        |j        <   |t          k     r6|j        =t          |           t          |                     dt                              z
  |_        |j        t          k     rt          | |||j                  }||j        |j        <   |t          k     r|j        t          k    rt          | ||          }|t          k     r|                    |           t#          |          S )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

    Four checks are applied in order to each multi-byte candidate:

    1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
       >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

    2. **Minimum non-ASCII byte count**: the data must contain at least
       ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
       can accidentally form perfect pairs and score 1.0 structurally.

    3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
       total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
       text has many high bytes that are NOT consumed by multi-byte pairs;
       genuine CJK text has nearly all high bytes accounted for.

    4. **Lead byte diversity**: the number of distinct lead byte values in
       valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
       draws from a wide repertoire of lead bytes; European false positives
       cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

    Returns the filtered candidate list.  Structural scores are cached in
    ``ctx.mb_scores`` for reuse in Stage 2b.
    N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr   len	translater   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)r   r   r   gatedencmb_scorebyte_coveragelead_diversitys           r   _gate_cjk_candidatesr   X  s1   : !#E   	/c3??H&.CM#(#++ " X&)$ii#dnnT:6V6V2W2W&W#"%77 ;c30C  M )6COCH%55 "&BB !<T3!L!L!$;; S<<r   structural_scoresc           	      ,  
 d |D             
t          
fd|D                       }t          d |D                       }t          t          | dt                   g ||R                     }g }|D ]}|j        r |j                            |j        d          nd}	|	dk    r@|                    t          |j        |j	        d|	z   z  |j
        |j                             q|                    |           |                    d d	
           |S )a  Score structurally-valid CJK candidates using statistical bigrams.

    When multiple CJK encodings score equally high structurally, statistical
    scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
    Single-byte candidates are also scored and included so that the caller
    can compare CJK vs single-byte confidence.

    Multi-byte candidates with high byte coverage (>= 0.95) receive a
    confidence boost proportional to coverage.  When nearly all non-ASCII
    bytes form valid multi-byte pairs, the structural evidence is strong
    and should increase the candidate's ranking relative to single-byte
    alternatives whose bigram models may score higher on small samples.

    Note: boosted confidence values may exceed 1.0 and are used only for
    relative ranking among candidates.  ``run_pipeline`` clamps all
    confidence values to [0.0, 1.0] before returning to callers.
    c                 ,    i | ]}|j         	|j        |S r   )r   r   r   es     r   
<dictcomp>z0_score_structural_candidates.<locals>.<dictcomp>  s3     + + +q~+	+ + +r   c              3   6   K   | ]\  }}|v 	|         V  d S Nr   )r   r   _sc
enc_lookups      r   r   z/_score_structural_candidates.<locals>.<genexpr>  sJ        &T3tz?Q4     r   c              3   (   K   | ]}|j         	|V  d S r   )r   r   s     r   r   z/_score_structural_candidates.<locals>.<genexpr>  s)      JJa1>JJJJJJJr   N        gffffff?   c                     | j         S r   )r!   xs    r   <lambda>z._score_structural_candidates.<locals>.<lambda>  s    q| r   Tkeyreverse)r   listr   _STAT_SCORE_MAX_BYTESr    r   r   r   r   r!   r"   r#   sort)r   r   r   r   valid_mbsingle_byteresultsboostedrcoverager   s             @r   _score_structural_candidatesr     sm   .+ +++ + +J     *;    H JJ#3JJJJJK44457P7PK7P7PQQ G
 &(G 	 	;<:N3?&&qz37773t 	NNJH =qz1;     NN1LL++TL:::Nr   r   c                 x   t          |          dk    r|d         j        t          |d         j        |           r||d         j        |d         j        }|dd         D ]Wj        t          v rGt          j        |j        j                  }fd|D             }fd|D             }|g||c S X|S )a  Demote niche Latin encodings when no distinguishing bytes are present.

    Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
    on data that contains only bytes shared with common Western Latin
    encodings.  When there is no byte-level evidence for the winning
    encoding, promote the first common Western Latin candidate to the top and
    push the demoted encoding to last.
    r   r   Nc                 2    g | ]}|j         k    |u|S r   r    )r   r   demoted_encodingr   s     r   
<listcomp>z'_demote_niche_latin.<locals>.<listcomp>  sC       !*8H*HMNVWZ  r   c                 *    g | ]}|j         k    |S r   r   )r   r   r   s     r   r   z'_demote_niche_latin.<locals>.<listcomp>  s&    "X"X"XGW9W"X1"X"X"Xr   )r   r    r   r!   r'   r   r"   r#   )r   r   top_confpromotedothersdemoted_entriesr   r   s         @@r   _demote_niche_latinr     s    	Gq=AJ= 71:.55=
 #1:.1:( 		= 		=Az44 =*J!*ak     &   #Y"X"X"Xg"X"X"X <6<O<<<<= Nr   c                 f   |r|d         j         dk    r|S t          d t          |          D             d          |S t          d | D                       rV|         }|d         j        }t          |j         ||j        |j                  }fdt          |          D             }|g|S |S )a  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

    KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
    making statistical discrimination difficult.  However, KOI8-T maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of these bytes appear, KOI8-T is the
    better match.
    r   zkoi8-rc              3   6   K   | ]\  }}|j         d k    |V  dS )zkoi8-tNr   )r   ir   s      r   r   z!_promote_koi8t.<locals>.<genexpr>  s2      QQDAq!*:PQaQQQQQQr   Nc              3   4   K   | ]}|d k    |t           v V  dS r   )rz   )r   r   s     r   r   z!_promote_koi8t.<locals>.<genexpr>  s1      
A
A1D
A1&&
A
A
A
A
A
Ar   c                 &    g | ]\  }}|k    |S r   r   )r   r   r   	koi8t_idxs      r   r   z"_promote_koi8t.<locals>.<listcomp>  s&    EEE1a9nE!EEEr   )r    next	enumerater   r!   r   r"   r#   )r   r   koi8t_resultr   r   r   r   s         @r   _promote_koi8tr     s      gaj)X5 QQIg$6$6QQQSWXXI 

A
A
A
A
AAA 
#y)1:("!!"	
 
 FEEE	' 2 2EEE"6""Nr   i   c                     |dk    r| S 	 |                      |d                              dd          S # t          t          t          f$ r Y dS w xY w)aP  Decode data from encoding and re-encode as UTF-8 for language scoring.

    Returns None if the encoding is unknown. For UTF-8, returns data as-is.
    Uses ``errors="ignore"`` because the data already passed byte-validity
    filtering for the detected encoding; any residual invalid bytes are
    irrelevant for language scoring.
    utf-8ignorer   surrogatepassN)r   encoder   	TypeError
ValueError)r   r    s     r   _to_utf8r     sx     7 {{8H{55<<O = 
 
 	
 J/   tts   +6 AAc           	         g }d}d}|D ]5}|j         }||j        t          |j                  }|A| r?t          |j                  r+|t	          |           }t          | |j        |          \  }}|Y| rWt          d          rHt          | |j                  }|r1||j        dk    rt	          |          }t          |d|          \  }}|j        }	|	|j        dnd}	||j         k    s|	|j        k    r1|                    t          |j        |j
        ||	                      |                    |           7|S )a  Fill in language and mime_type for results missing them.

    **Language** (only for text results where ``encoding is not None``):

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).

    **MIME type**: text results default to ``"text/plain"``, binary results
    (``encoding is None``) default to ``"application/octet-stream"``.
    N)profiler   z
text/plainr   )r"   r    r   r   r   r   r   r#   r   r   r!   )
r   r   filledr   utf8_profileresultlang_	utf8_datamimes
             r   _fill_metadatar    s    %'F$(G)-L !" !" 	FO 	!&/22D V V);FO)L)L V 2+D11G-dFOWUUU4  );G)D)D $T6?;;	 # @v'/I @'4Y'?'?1!7L  GAt  	 ?0/  6?" 	"df.>&> 	"MM1BD$OO    MM&!!!!Mr   c                 b    t          | |          }t          | |          }t          | |          S )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )r   r   s     r   _postprocess_resultsr  O  s2    
 'tW55G!$00G$(((r   r&   r   include_encodingsexclude_encodingsno_match_encodingempty_input_encodingencoding_era	max_bytesr	  r
  r  r  c                L   t                      }| d|         } t          |||          }t          d |D                       }	| st          ||	d          S t	          |           }
|
|
j        |	v r|
gS t          |           }||j        |	v r|gS t          |           }||j        |j        |	v r|gS t          |           }||gS t          |           }t          |           }||t          | |          rt          gS t          |           }||j        |	v rt          | ||	          }|gS ||j        |	v r|gS ||j        |	v r|gS t          | |          }|st          ||	d          S t!          | ||          }|st          ||	d          S g }|D ]]}|j        rT|j                            |j                  }|t+          | ||          }|dk    r|                    |j        |f           ^|rR|                    d d	           |d
         \  }}|t0          k    r$t3          | |||          }|rt5          | |          S | dt6                   }t9          t;          |t=          |                              }|st          ||	d          S t5          | |          S )zBCore pipeline logic. Returns list of results sorted by confidence.Nc              3   $   K   | ]}|j         V  d S r   )r   )r   r   s     r   r   z%_run_pipeline_core.<locals>.<genexpr>k  s$      'G'GS'G'G'G'G'G'Gr   r  )r  r  r   c                     | d         S )Nr   r   r   s    r   r   z$_run_pipeline_core.<locals>.<lambda>  s
    QqT r   Tr   r   )r   r   	frozensetr   r   r    r   r   r   r   r   r   _BINARY_RESULTr   r   r   r   r   r   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r  r   r   r   r   )r   r  r  r	  r
  r  r  r   
candidatesr   
bom_resultutf1632_resultescape_resultmagic_resultutf8_precheckascii_precheckr   r   r   r   scorer  
best_scorer   	stat_datas                            r   _run_pipeline_corer  Y  s    

C

D
  .?ARSSJ''G'GJ'G'G'GGGG 
% '+A
 
 	
 D!!J *"5"@ |
 -T22N  n&=&H  
 +400M" "g-
   %%L ~  %%M "$''N
 	   di000 
 
 *$//M ]%;w%F 4T='RR   n&=&H    ]%;w%F  *$
;; W%&7BUVVV ,D2BCHH W%&7BUVVV 24 < < 	<M%%ch//E A0sC@@s{ <!((#(E):;;;  ;>>4@@@)!,:99 	;2')93 G  ;+D':::
 +++,I#Iu5E/F/FGGHHG W%&7BUVVVg...r   c          	          t          | ||||||          }t          | dt                   |          }|sd}t          |          d |D             S )aU  Run the full detection pipeline.

    :param data: The raw byte data to analyze.
    :param encoding_era: Filter candidates to a specific era of encodings.
    :param max_bytes: Maximum number of bytes to process.
    :param include_encodings: If not ``None``, only return these encodings.
    :param exclude_encodings: If not ``None``, never return these encodings.
    :param no_match_encoding: Encoding returned when no candidate survives.
    :param empty_input_encoding: Encoding returned for empty input.
    :returns: A list of :class:`DetectionResult` sorted by confidence descending.
    r  Nz/pipeline must always return at least one resultc           	          g | ]C}|j         d k    r4t          |j        t          |j         d           |j        |j                  n|DS )g      ?)r!   r   r    minr"   r#   )r   r   s     r   r   z run_pipeline.<locals>.<listcomp>  s^         <#	
Cc$:$:AJTTT  r   )r  r  _LANG_SCORE_MAX_BYTESRuntimeError)	r   r  r  r	  r
  r  r  r   msgs	            r   run_pipeliner&    s    * !+++1  G T"8#8"897CCG  ?3  	   r   )V__doc__r   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r   chardet.pipeliner	   r
   r   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.magicr   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   r  r  r   r  r'   str__annotations__rV   intr]   r`   ro   rq   dictrz   r   bytesr   r   r   boolr   r   r   r   r   r   r   r   floatr   r   r   r#  r   r  r  r  r&  r   r   r   <module>r@     s      , , , , , , % % % % % %                         0 / / / / / - - - - - - + + + + + + ? ? ? ? ? ? : : : : : : / / / / / / 9 9 9 9 9 9 9 9 9 9 9 9         
 . - - - - - < < < < < < 8 8 8 8 8 8 C C C C C C C C C C '(	   $(     +4)  + + 3    /8i/ / /1/ 1/ Ys^ 1 1 1n /8i     "/ "/ Ys^ " " "T 09y(((0 0 in    -6I  - - 9S>   : .-**	3 3 d3	#./    *3LLL* * 	#    / / T#s(^   !
!"! s^! 	! ! ! !HPPs^P P 
/	P P P P*BS B B$ B B B B$          " 3
3L#-.3 
3 <	3 3 3 3l/
/E#u*-./ L#-./ 
	/
 
// / / /d
/" 
/   @
/" 
/   H  5 C EDL    $3
3/3	/3 3 3 3l)
)/") 
/) ) ) ) 'J/
 04/3% 'J/ J/ J/
J/J/ J/
 !~,J/ !~,J/ J/ J/ 
/J/ J/ J/ J/` ',
 04/3% ', , ,
,, ,
 !~,, !~,, , , 
/, , , , , ,r   