§
    ²f jc-  ã                   óÔ   — U d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
d	Zd
Zdez   Zeed<   dededefd„Zdededz  fd„Zdededz  fd„Zdededz  fd„Zdedefd„Zddededefd„ZdS )a«  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
é    N)ÚASCII_TEXT_BYTESÚDETERMINISTIC_CONFIDENCEÚDetectionResulti   é   é
   g¸…ëQ¸ž?ç      à?gffffffæ?g333333Ã?ó    Ú_NULL_SEPARATOR_ALLOWEDÚdataÚ	null_fracÚreturnc                 óT   — |t           k    rdS |                      dt          ¦  «         S )u‹  Return True if the data looks like ASCII with null byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd positions
        for LE) â€” not the total null fraction across all bytes.

    Checks two conditions:
    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
    2. Every non-null byte is printable ASCII or common whitespace

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    FN)Ú_NULL_SEPARATOR_MAX_FRACTIONÚ	translater
   )r   r   s     úlC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\venv\Lib\site-packages\chardet/pipeline/utf1632.pyÚ_is_null_separator_patternr   6   s/   € ð Õ0Ò0ð ØˆuØ~Š~˜dÕ$;Ñ<Ô<Ð<Ð<ó    c                 ó˜   — | dt           …         }t          |¦  «        t          k     rdS t          |¦  «        }||S t	          |¦  «        S )a  Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    N)Ú_SAMPLE_SIZEÚlenÚ_MIN_BYTES_UTF16Ú_check_utf32Ú_check_utf16)r   ÚsampleÚresults      r   Údetect_utf1632_patternsr   J   sW   € ð -•<-Ô €Få
ˆ6{„{Õ%Ò%ð Øˆtõ ˜&Ñ!Ô!€FØð Øˆõ ˜ÑÔÐr   c           	      ó¬  ‡ — t          ‰ ¦  «        t          ‰ ¦  «        dz  z
  }|t          k     rdS ‰ d|…         Š |dz  }t          ˆ fd„t          dt          ‰ ¦  «        d¦  «        D ¦   «         ¦  «        }t          ˆ fd„t          dt          ‰ ¦  «        d¦  «        D ¦   «         ¦  «        }||k    rV||z  dk    rM	 ‰                      d¦  «        }t          |¦  «        rt          dt          d¬¦  «        S n# t          $ r Y nw xY wt          ˆ fd	„t          d
t          ‰ ¦  «        d¦  «        D ¦   «         ¦  «        }t          ˆ fd„t          dt          ‰ ¦  «        d¦  «        D ¦   «         ¦  «        }||k    rV||z  dk    rM	 ‰                      d¦  «        }t          |¦  «        rt          dt          d¬¦  «        S n# t          $ r Y nw xY wdS )a’  Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
    é   Nc              3   ó4   •K  — | ]}‰|         d k    ¯dV — ŒdS ©r   é   N© ©Ú.0Úir   s     €r   ú	<genexpr>z_check_utf32.<locals>.<genexpr>t   ó0   øè è € ÐJÐJ˜a¸TÀ!¼WÈº\ÐJ˜ÐJÐJÐJÐJÐJÐJr   r   c              3   ó:   •K  — | ]}‰|d z            dk    ¯d V — ŒdS )r!   r   Nr"   r#   s     €r   r&   z_check_utf32.<locals>.<genexpr>v   s5   øè è € ÐOÐO˜q¸dÀ1ÀqÁ5¼kÈQÒ>NÐO˜ÐOÐOÐOÐOÐOÐOr   r   z	utf-32-be©ÚencodingÚ
confidenceÚlanguagec              3   ó4   •K  — | ]}‰|         d k    ¯dV — ŒdS r    r"   r#   s     €r   r&   z_check_utf32.<locals>.<genexpr>…   s0   øè è € ÐIÐI˜Q¸DÀ¼GÀqºLÐIqÐIÐIÐIÐIÐIÐIr   é   c              3   ó4   •K  — | ]}‰|         d k    ¯dV — ŒdS r    r"   r#   s     €r   r&   z_check_utf32.<locals>.<genexpr>‡   r'   r   é   z	utf-32-le)	r   Ú_MIN_BYTES_UTF32ÚsumÚrangeÚdecodeÚ_looks_like_textr   r   ÚUnicodeDecodeError)r   Útrimmed_lenÚ	num_unitsÚbe_first_nullÚbe_second_nullÚtextÚle_last_nullÚle_third_nulls   `       r   r   r   `   s;  ø€ õ d‘)”)s 4™yœy¨1™}Ñ-€KØÕ%Ò%ð ØˆtØÔ€Dà˜qÑ €Iõ ÐJÐJÐJÐJ¥5¨­C°©I¬I°qÑ#9Ô#9ÐJÑJÔJÑJÔJ€MåÐOÐOÐOÐO¥E¨!­S°©Y¬Y¸Ñ$:Ô$:ÐOÑOÔOÑOÔO€Nà˜	Ò!ð 
 n°yÑ&@À3Ò&Fð 
ð		Ø—;’;˜{Ñ+Ô+ˆDÝ Ñ%Ô%ð Ý&Ø(Ý7Ø!ðñ ô ð ðøõ "ð 	ð 	ð 	ØˆDð	øøøõ ÐIÐIÐIÐI¥%¨­3¨t©9¬9°aÑ"8Ô"8ÐIÑIÔIÑIÔI€LåÐJÐJÐJÐJ¥5¨­C°©I¬I°qÑ#9Ô#9ÐJÑJÔJÑJÔJ€MàyÒ ð 
 ]°YÑ%>ÀÒ%Dð 
ð		Ø—;’;˜{Ñ+Ô+ˆDÝ Ñ%Ô%ð Ý&Ø(Ý7Ø!ðñ ô ð ðøõ "ð 	ð 	ð 	ØˆDð	øøøð ˆ4s$   Â>:C: Ã:
DÄDÆ:G Ç
GÇGc                 ó  ‡ — t          t          ‰ ¦  «        t          ¦  «        }||dz  z  }|t          k     rdS |dz  }t	          ˆ fd„t          d|d¦  «        D ¦   «         ¦  «        }t	          ˆ fd„t          d|d¦  «        D ¦   «         ¦  «        }||z  }||z  }g }|t          k    r/t          ‰ d|…         |¦  «        s|                     d|f¦  «         |t          k    r/t          ‰ d|…         |¦  «        s|                     d|f¦  «         |sdS t          |¦  «        dk    re|d         d         }	 ‰ d|…          	                    |¦  «        }	t          |	¦  «        rt          |t          d¬	¦  «        S n# t          $ r Y nw xY wdS d}
d
}|D ]M\  }}	 ‰ d|…          	                    |¦  «        }	n# t          $ r Y Œ0w xY wt          |	¦  «        }||k    r|}|}
ŒN|
"|t          k    rt          |
t          d¬	¦  «        S dS )aý  Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes.  For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    r0   Nc              3   ó4   •K  — | ]}‰|         d k    ¯dV — ŒdS r    r"   r#   s     €r   r&   z_check_utf16.<locals>.<genexpr>°   ó0   øè è € ÐKÐK˜a¸dÀ1¼gÈºlÐK˜ÐKÐKÐKÐKÐKÐKr   r   c              3   ó4   •K  — | ]}‰|         d k    ¯dV — ŒdS r    r"   r#   s     €r   r&   z_check_utf16.<locals>.<genexpr>²   r@   r   r!   z	utf-16-lez	utf-16-ber)   ç      ð¿)Úminr   r   r   r2   r3   Ú_UTF16_MIN_NULL_FRACTIONr   Úappendr4   r5   r   r   r6   Ú_text_qualityÚ_MIN_TEXT_QUALITY)r   Ú
sample_lenr8   Úbe_null_countÚle_null_countÚbe_fracÚle_fracÚ
candidatesr*   r;   Úbest_encodingÚbest_qualityÚ_Úqualitys   `             r   r   r   ˜   s½  ø€ õ  •S˜‘Y”Y¥Ñ-Ô-€JØ*˜q‘.Ñ €JØÕ$Ò$ð Øˆtà˜a‘€Iõ ÐKÐKÐKÐK¥5¨¨J¸Ñ#:Ô#:ÐKÑKÔKÑKÔK€MåÐKÐKÐKÐK¥5¨¨J¸Ñ#:Ô#:ÐKÑKÔKÑKÔK€Mà˜iÑ'€GØ˜iÑ'€Gà*,€JØÕ*Ò*ð 2Õ3MØˆ[ˆjˆ[Ô˜7ñ4ô 4ð 2ð 	×Ò˜;¨Ð0Ñ1Ô1Ð1ØÕ*Ò*ð 2Õ3MØˆ[ˆjˆ[Ô˜7ñ4ô 4ð 2ð 	×Ò˜;¨Ð0Ñ1Ô1Ð1àð Øˆtõ ˆ:„˜!Òð Ø˜a”= Ô#ˆð		Ø˜˜˜Ô$×+Ò+¨HÑ5Ô5ˆDÝ Ñ%Ô%ð Ý&Ø%Ý7Ø!ðñ ô ð ðøõ "ð 	ð 	ð 	ØˆDð	øøøàˆtð !%€MØ€Là!ð %ð %‰ˆ!ð	Ø˜˜˜Ô$×+Ò+¨HÑ5Ô5ˆDˆDøÝ!ð 	ð 	ð 	ØˆHð	øøøå Ñ%Ô%ˆØ\Ò!ð 	%Ø"ˆLØ$ˆMøàð 
 \Õ5FÒ%Fð 
ÝØ"Ý/Øð
ñ 
ô 
ð 	
ð ˆ4s%   Ä8AE< Å<
F	ÆF	ÆF8Æ8
GÇGr;   c                 ó†   — | sdS | dd…         }t          d„ |D ¦   «         ¦  «        }|t          |¦  «        z  t          k    S )z9Quick check: is decoded text mostly printable characters.FNéô  c              3   óJ   K  — | ]}|                      ¦   «         s|d v ¯dV — ŒdS )ú
	r!   N)Úisprintable)r$   Úcs     r   r&   z#_looks_like_text.<locals>.<genexpr>ñ   s8   è è € ÐJÐJ˜!¨¯ª©¬ÐJ¸AÀ¸MÐJAÐJÐJÐJÐJÐJÐJr   )r2   r   Ú_MIN_PRINTABLE_FRACTION)r;   r   Ú	printables      r   r5   r5   ì   sO   € àð ØˆuØ$3$ŒZ€FÝÐJÐJ˜vÐJÑJÔJÑJÔJ€IØ•s˜6‘{”{Ñ"Õ%<Ò<Ð<r   rS   Úlimitc                 ó¶  — | d|…         }t          |¦  «        }|dk    rdS d}d}d}d}d}|D ]s}	t          j        |	¦  «        }
|
d         dk    r|dz  }t          |	¦  «        dk     r|dz  }Œ@|
d         dk    r|dz  }ŒR|
dk    s|	d	v r|dz  }Œb|
d         d
k    r|dz  }Œt||z  dk    rdS ||z  dk    rdS ||z  }|||z  dz  z  }|dk    r|dk    r|dz  }|S )uØ  Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6).  Higher values indicate
    more natural text.  The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 approaches as sample size grows with all ASCII letters plus
    whitespace).  A score of -1.0 means the content is almost certainly not
    valid text (too many control characters or combining marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
      primary signal for disambiguating endianness â€” correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    Nr   rB   ÚLr!   é€   ÚMÚZsrU   ÚCgš™™™™™¹?gš™™™™™É?r   é   )r   ÚunicodedataÚcategoryÚord)r;   rZ   r   ÚnÚlettersÚmarksÚspacesÚcontrolsÚascii_lettersrW   ÚcatÚscores               r   rF   rF   õ   sa  € ð( &5&Œ\€FÝˆF‰Œ€AØˆA‚vð Øˆtà€GØ€EØ€FØ€HØ€Màð ð ˆÝÔ" 1Ñ%Ô%ˆØˆqŒ6SŠ=ð 		Øq‰LˆGÝ1‰vŒv˜Š|ð #Ø Ñ"øØŒVsŠ]ð 	ØQ‰JˆEˆEØDŠ[ð 	˜A ˜Mð 	Øa‰KˆFˆFØŒVsŠ]ð 	Ø˜‰MˆHøð !|cÒð ØˆtØˆqy3‚ð Øˆtàa‰K€Eà	ˆm˜aÑ 3Ñ&Ñ&€Eàˆ2‚vð &˜1’*ð Ø‰ˆà€Lr   )rS   )Ú__doc__rb   Úchardet.pipeliner   r   r   r   r1   r   rD   rG   rX   r   r
   ÚbytesÚ__annotations__ÚfloatÚboolr   r   r   r   Ústrr5   ÚintrF   r"   r   r   ú<module>ru      sŽ  ðð	ð 	ð 	ð Ð Ð Ð à XÐ XÐ XÐ XÐ XÐ XÐ XÐ XÐ XÐ Xð €ð Ð ØÐ ð  Ð ð Ð ð Ð ð  $Ð ð ")Ð+;Ñ!;Ð ˜Ð ;Ð ;Ñ ;ð= Uð =°uð =Àð =ð =ð =ð =ð(  %ð  ¨O¸dÑ,Bð  ð  ð  ð  ð,5uð 5 °4Ñ!7ð 5ð 5ð 5ð 5ðpQuð Q °4Ñ!7ð Qð Qð Qð Qðh=˜3ð = 4ð =ð =ð =ð =ð9ð 9˜ð 9 Cð 9°%ð 9ð 9ð 9ð 9ð 9ð 9r   