o
    j5,                     @   s   U d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
d	Zd
Zde Zeed< dededefddZdededB fddZdededB fddZdededB fddZdedefddZddededefddZdS ) a  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)ASCII_TEXT_BYTESDETERMINISTIC_CONFIDENCEDetectionResulti      
   gQ?      ?gffffff?g333333?    _NULL_SEPARATOR_ALLOWEDdata	null_fracreturnc                 C   s   |t krdS | dt S )u  Return True if the data looks like ASCII with null byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd positions
        for LE) — not the total null fraction across all bytes.

    Checks two conditions:
    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
    2. Every non-null byte is printable ASCII or common whitespace

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    FN)_NULL_SEPARATOR_MAX_FRACTION	translater	   )r
   r    r   ^/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/chardet/pipeline/utf1632.py_is_null_separator_pattern6   s   r   c                 C   s8   | dt  }t|tk rdS t|}|dur|S t|S )a  Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    N)_SAMPLE_SIZElen_MIN_BYTES_UTF16_check_utf32_check_utf16)r
   sampleresultr   r   r   detect_utf1632_patternsJ   s   r   c                    s`  t  t  d  }|tk rdS  d|  |d }t fddtdt  dD }t fddtdt  dD }||krc|| dkrcz d}t|rXtdtdd	W S W n	 tyb   Y nw t fd
dtdt  dD }t fddtdt  dD }||kr|| dkrz d}t|rtdtdd	W S W dS  ty   Y dS w dS )a  Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
       Nc                 3        | ]} | d krdV  qdS r      Nr   .0ir
   r   r   	<genexpr>t       z_check_utf32.<locals>.<genexpr>r   c                 3   s$    | ]} |d   dkrd V  qdS )r   r   Nr   r   r!   r   r   r"   v      " r   z	utf-32-beencoding
confidencelanguagec                 3   r   r   r   r   r!   r   r   r"      r#      c                 3   r   r   r   r   r!   r   r   r"      r#      z	utf-32-le)	r   _MIN_BYTES_UTF32sumrangedecode_looks_like_textr   r   UnicodeDecodeError)r
   trimmed_len	num_unitsbe_first_nullbe_second_nulltextle_last_nullle_third_nullr   r!   r   r   `   sL   ""
""
	r   c              	      s  t t t}||d 8 }|tk rdS |d }t fddtd|dD }t fddtd|dD }|| }|| }g }|tkrSt d| |sS|d|f |tkrgt d| |sg|d	|f |skdS t|dkr|d d }z d| 	|}	t
|	rt|tdd
W S W dS  ty   Y dS w d}
d}|D ]%\}}z d| 	|}	W n	 ty   Y qw t|	}||kr|}|}
q|
dur|tkrt|
tdd
S dS )a  Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes.  For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    r*   Nc                 3   r   r   r   r   r!   r   r   r"      r#   z_check_utf16.<locals>.<genexpr>r   c                 3   r   r   r   r   r!   r   r   r"      r#   r   z	utf-16-lez	utf-16-ber%         )minr   r   r   r,   r-   _UTF16_MIN_NULL_FRACTIONr   appendr.   r/   r   r   r0   _text_quality_MIN_TEXT_QUALITY)r
   
sample_lenr2   be_null_countle_null_countbe_fracle_frac
candidatesr&   r5   best_encodingbest_quality_qualityr   r!   r   r      sp   

r   r5   c                 C   s6   | sdS | dd }t dd |D }|t| tkS )z9Quick check: is decoded text mostly printable characters.FN  c                 s   s$    | ]}|  s|d v rdV  qdS )
	r   N)isprintable)r   cr   r   r   r"      r$   z#_looks_like_text.<locals>.<genexpr>)r,   r   _MIN_PRINTABLE_FRACTION)r5   r   	printabler   r   r   r/      s
   r/   rH   limitc                 C   s
  | d| }t |}|dkrdS d}d}d}d}d}|D ]>}	t|	}
|
d dkr8|d7 }t|	dk r7|d7 }q|
d dkrC|d7 }q|
dksK|	d	v rP|d7 }q|
d d
krZ|d7 }q|| dkrcdS || dkrkdS || }||| d 7 }|dkr|dkr|d7 }|S )u  Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6).  Higher values indicate
    more natural text.  The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 approaches as sample size grows with all ASCII letters plus
    whitespace).  A score of -1.0 means the content is almost certainly not
    valid text (too many control characters or combining marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
      primary signal for disambiguating endianness — correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    Nr   r8   Lr      MZsrI   Cg?g?r      )r   unicodedatacategoryord)r5   rN   r   nlettersmarksspacescontrolsascii_lettersrK   catscorer   r   r   r<      s@   


r<   )rH   )__doc__rU   chardet.pipeliner   r   r   r   r+   r   r:   r=   rL   r   r	   bytes__annotations__floatboolr   r   r   r   strr/   intr<   r   r   r   r   <module>   s"    8T	