
    f jc-                         U d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
d	Zd
Zdez   Zeed<   dededefdZdededz  fdZdededz  fdZdededz  fdZdedefdZddededefdZdS )a  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)ASCII_TEXT_BYTESDETERMINISTIC_CONFIDENCEDetectionResulti      
   gQ?      ?gffffff?g333333?    _NULL_SEPARATOR_ALLOWEDdata	null_fracreturnc                 T    |t           k    rdS |                     dt                     S )u  Return True if the data looks like ASCII with null byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd positions
        for LE) — not the total null fraction across all bytes.

    Checks two conditions:
    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
    2. Every non-null byte is printable ASCII or common whitespace

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    FN)_NULL_SEPARATOR_MAX_FRACTION	translater
   )r   r   s     lC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\venv\Lib\site-packages\chardet/pipeline/utf1632.py_is_null_separator_patternr   6   s/     00 u~~d$;<<<<    c                     | dt                    }t          |          t          k     rdS t          |          }||S t	          |          S )a  Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    N)_SAMPLE_SIZElen_MIN_BYTES_UTF16_check_utf32_check_utf16)r   sampleresults      r   detect_utf1632_patternsr   J   sW     -<- F
6{{%% t &!!F  r   c           	          t                     t                     dz  z
  }|t          k     rdS  d|          |dz  }t           fdt          dt                     d          D                       }t           fdt          dt                     d          D                       }||k    rV||z  dk    rM	                      d          }t          |          rt          dt          d          S n# t          $ r Y nw xY wt           fd	t          d
t                     d          D                       }t           fdt          dt                     d          D                       }||k    rV||z  dk    rM	                      d          }t          |          rt          dt          d          S n# t          $ r Y nw xY wdS )a  Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
       Nc              3   4   K   | ]}|         d k    dV  dS r      N .0ir   s     r   	<genexpr>z_check_utf32.<locals>.<genexpr>t   0      JJaT!W\JJJJJJJr   r   c              3   :   K   | ]}|d z            dk    d V  dS )r!   r   Nr"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>v   s5      OOqd1q5kQ>NOOOOOOOr   r   z	utf-32-beencoding
confidencelanguagec              3   4   K   | ]}|         d k    dV  dS r    r"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>   s0      IIQDGqLIqIIIIIIr      c              3   4   K   | ]}|         d k    dV  dS r    r"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>   r'   r      z	utf-32-le)	r   _MIN_BYTES_UTF32sumrangedecode_looks_like_textr   r   UnicodeDecodeError)r   trimmed_len	num_unitsbe_first_nullbe_second_nulltextle_last_nullle_third_nulls   `       r   r   r   `   s;    d))s4yy1}-K%% tDq I JJJJ5CIIq#9#9JJJJJMOOOOE!SYY$:$:OOOOON	! 
ny&@3&F 
		;;{++D%% &(7!    " 	 	 	D	 IIII%3t99a"8"8IIIIILJJJJ5CIIq#9#9JJJJJMy  
]Y%>%D 
		;;{++D%% &(7!    " 	 	 	D	 4s$   >:C: :
DD:G 
GGc                     t          t                     t                    }||dz  z  }|t          k     rdS |dz  }t	           fdt          d|d          D                       }t	           fdt          d|d          D                       }||z  }||z  }g }|t          k    r/t           d|         |          s|                    d|f           |t          k    r/t           d|         |          s|                    d|f           |sdS t          |          dk    re|d         d         }	  d|         	                    |          }	t          |	          rt          |t          d	          S n# t          $ r Y nw xY wdS d}
d
}|D ]M\  }}	  d|         	                    |          }	n# t          $ r Y 0w xY wt          |	          }||k    r|}|}
N|
"|t          k    rt          |
t          d	          S dS )a  Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes.  For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    r0   Nc              3   4   K   | ]}|         d k    dV  dS r    r"   r#   s     r   r&   z_check_utf16.<locals>.<genexpr>   0      KKad1glKKKKKKKr   r   c              3   4   K   | ]}|         d k    dV  dS r    r"   r#   s     r   r&   z_check_utf16.<locals>.<genexpr>   r@   r   r!   z	utf-16-lez	utf-16-ber)         )minr   r   r   r2   r3   _UTF16_MIN_NULL_FRACTIONr   appendr4   r5   r   r   r6   _text_quality_MIN_TEXT_QUALITY)r   
sample_lenr8   be_null_countle_null_countbe_fracle_frac
candidatesr*   r;   best_encodingbest_quality_qualitys   `             r   r   r      s     SYY--J*q. J$$ taI KKKK5J#:#:KKKKKMKKKK5J#:#:KKKKKMi'Gi'G*,J** 23M[j[74 4 2 	;0111** 23M[j[74 4 2 	;0111 t :! a=#		$++H55D%% &%7!    " 	 	 	D	t !%ML! % %!	$++H55DD! 	 	 	H	%%\! 	%"L$M 
\5F%F 
"/
 
 
 	
 4s%   8AE< <
F	F	F88
GGr;   c                     | sdS | dd         }t          d |D                       }|t          |          z  t          k    S )z9Quick check: is decoded text mostly printable characters.FN  c              3   J   K   | ]}|                                 s|d v dV  dS )
	r!   N)isprintable)r$   cs     r   r&   z#_looks_like_text.<locals>.<genexpr>   s8      JJ!JAMJAJJJJJJr   )r2   r   _MIN_PRINTABLE_FRACTION)r;   r   	printables      r   r5   r5      sO     u$3$ZFJJvJJJJJIs6{{"%<<<r   rS   limitc                    | d|         }t          |          }|dk    rdS d}d}d}d}d}|D ]s}	t          j        |	          }
|
d         dk    r|dz  }t          |	          dk     r|dz  }@|
d         dk    r|dz  }R|
dk    s|	d	v r|dz  }b|
d         d
k    r|dz  }t||z  dk    rdS ||z  dk    rdS ||z  }|||z  dz  z  }|dk    r|dk    r|dz  }|S )u  Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6).  Higher values indicate
    more natural text.  The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 approaches as sample size grows with all ASCII letters plus
    whitespace).  A score of -1.0 means the content is almost certainly not
    valid text (too many control characters or combining marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
      primary signal for disambiguating endianness — correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    Nr   rB   Lr!      MZsrU   Cg?g?r      )r   unicodedatacategoryord)r;   rZ   r   nlettersmarksspacescontrolsascii_lettersrW   catscores               r   rF   rF      sa   ( &5&\FFAAv tGEFHM  "1%%q6S= 		qLG1vv| #"Vs] 	QJEED[ 	AM 	aKFFVs] 	MH !|c tqy3 taKE	ma3&&E2v &1* Lr   )rS   )__doc__rb   chardet.pipeliner   r   r   r   r1   r   rD   rG   rX   r   r
   bytes__annotations__floatboolr   r   r   r   strr5   intrF   r"   r   r   <module>ru      s  	 	 	     X X X X X X X X X X             $  ")+;!;  ; ; ;=U =u = = = = =( %  Od,B        ,5u 54!7 5 5 5 5pQu Q4!7 Q Q Q Qh=3 =4 = = = =9 9 9C 9% 9 9 9 9 9 9r   