o
    žªj÷>  ã                   @   sê  U d Z ddlmZ ddlmZmZ ddlmZ dede	e
eef fdd„Zdede	e
eef fd	d
„Zdede	e
eef fdd„Zdede	e
eef fdd„Zdede	e
eef fdd„Zdede	e
eef fdd„Zdede	e
eef fdd„Zdede	e
eef fdd„Zdede	e
eef fdd„ZeeeeeeeedœZeeeege	e
eef f f ed< dededede	e
eef dB fdd„Zded edede
fd!d"„Z	d(ded eded#edB de
f
d$d%„Zded ededefd&d'„ZdS ))a™  Stage 2b: Multi-byte structural probing.

Computes how well byte patterns in the data match the expected multi-byte
structure for a given encoding.  Used after byte-validity filtering (Stage 2a)
to further rank multi-byte encoding candidates.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
é    )ÚCallable)Ú
HIGH_BYTESÚPipelineContext)ÚEncodingInfoÚdataÚreturnc           
      C   s  d}d}d}t ƒ }d}t| ƒ}||k r|| | }d|  kr!dks.n d|  kr,dkrtn nF|d7 }|d |k ro| |d  }d|  krHdksUn d	|  krSd
kron n|d7 }| |¡ |d7 }|dkrj|d7 }|d7 }q|d7 }n|d7 }||k s|dkr„|| nd}	|	|t|ƒfS )z°Single-pass Shift_JIS structural analysis.

    Lead bytes: 0x81-0x9F, 0xE0-0xEF
    Trail bytes: 0x40-0x7E, 0x80-0xFC

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   é   éŸ   éà   éï   é   é@   é~   é€   éü   é   é   ç        ©ÚsetÚlenÚadd©
r   Ú
lead_countÚvalid_countÚmbÚleadsÚiÚlengthÚbÚtrailÚratio© r"   úa/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/chardet/pipeline/structural.pyÚ_analyze_shift_jis   s2   
..

ïr$   c           
      C   s  d}d}d}t ƒ }d}t| ƒ}||k r|| | }d|  kr!dks.n d|  kr,dkrtn nF|d7 }|d |k ro| |d  }d|  krHdksUn d	|  krSdkron n|d7 }| |¡ |d7 }|d
krj|d7 }|d7 }q|d7 }n|d7 }||k s|dkr„|| nd}	|	|t|ƒfS )aB  Single-pass CP932 structural analysis.

    Lead bytes: 0x81-0x9F, 0xE0-0xFC
    Trail bytes: 0x40-0x7E, 0x80-0xFC

    Extends Shift_JIS by raising the lead byte ceiling from 0xEF to 0xFC,
    covering IBM vendor-defined characters (NEC-selected, IBM extensions).

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r"   r"   r#   Ú_analyze_cp932E   s2   ..

ïr%   c           	      C   sÎ  d}d}d}t ƒ }d}t| ƒ}||k rÖ| | }|dkrL|d7 }|d |k rGd| |d    kr3dkrGn n|d7 }| |¡ |d7 }|d7 }q|d7 }n†|dkr‘|d7 }|d |k rŒd| |d    krhdkrŒn n"d| |d    krxdkrŒn n|d7 }| |¡ |d	7 }|d	7 }q|d7 }nAd|  kr›dkrÎn n1|d7 }|d |k rÉd| |d    krµdkrÉn n|d7 }| |¡ |d7 }|d7 }q|d7 }n|d7 }||k s|dkrÞ|| nd
}||t|ƒfS )zóSingle-pass EUC-JP structural analysis.

    Two-byte: Lead 0xA1-0xFE, Trail 0xA1-0xFE
    SS2 (half-width katakana): 0x8E + 0xA1-0xDF
    SS3 (JIS X 0212): 0x8F + 0xA1-0xFE + 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   éŽ   r   é¡   éß   r   é   éþ   é   r   r   ©	r   r   r   r   r   r   r   r   r!   r"   r"   r#   Ú_analyze_euc_jpn   sR   ,

  

,

Ü%r-   c           	      C   sÚ   d}d}d}t ƒ }d}t| ƒ}||k r\| | }d|  kr!dkrTn n1|d7 }|d |k rOd| |d    kr;dkrOn n|d7 }| |¡ |d7 }|d7 }q|d7 }n|d7 }||k s|dkrd|| nd}||t|ƒfS )z†Single-pass EUC-KR structural analysis.

    Lead 0xA1-0xFE; Trail 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r'   r*   r   r   r   r   r,   r"   r"   r#   Ú_analyze_euc_kr¨   s*   	,

ôr.   c           
      C   s0  d}d}d}t ƒ }d}t| ƒ}||k r‡| | }d|  kr!dks.n d|  kr,dkrn nQ|d7 }|d |k rz| |d  }d|  krHdks`n d	|  krSd
ks`n d|  kr^dkrzn n|d7 }| |¡ |d7 }|dkru|d7 }|d7 }q|d7 }n|d7 }||k s|dkr|| nd}	|	|t|ƒfS )at  Single-pass CP949 (Unified Hangul Code) structural analysis.

    Lead bytes: 0x81-0xC8, 0xCA-0xFD
    Trail bytes: 0x41-0x5A, 0x61-0x7A, 0x81-0xFE

    Extends EUC-KR by lowering the lead byte floor from 0xA1 to 0x81 and
    adding ASCII letter trail ranges plus 0x81-0xA0.  0xC9 is not a valid
    UHC lead byte.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r   éÈ   éÊ   éý   r   éA   éZ   éa   éz   r*   r   r   r   r   r   r"   r"   r#   Ú_analyze_cp949È   s6   .

ër6   c           	      C   s‚  d}d}d}t ƒ }d}t| ƒ}||k r°| | }d|  kr!dkr¨n n…|d7 }|d |k rod| |d    kr;dkron n2d| |d    krKdkron n"d| |d    kr[dkron n|d7 }| |¡ |d7 }|d	7 }qd
|  krydkr£n n(|d |k r£d
| |d    krdkr£n n|d7 }| |¡ |d7 }|d7 }q|d7 }n|d7 }||k s|dkr¸|| nd}||t|ƒfS )aÕ  Single-pass GB18030 / GB2312 structural analysis.

    Only counts strict GB2312 2-byte pairs (lead 0xA1-0xF7, trail 0xA1-0xFE)
    and GB18030 4-byte sequences.  The broader GBK extension range
    (lead 0x81-0xFE, trail 0x40-0x7E / 0x80-0xFE) is intentionally excluded
    because it is so permissive that unrelated single-byte data (EBCDIC, DOS
    codepages, etc.) can score 1.0, leading to false positives.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r   r*   r   r+   é0   é9   r   é   r'   é÷   r   r   r,   r"   r"   r#   Ú_analyze_gb18030ö   s<      
D

çr;   c           
      C   s  d}d}d}t ƒ }d}t| ƒ}||k rq| | }d|  kr!dkrin nF|d7 }|d |k rd| |d  }d|  kr=dksJn d|  krHdkrdn n|d7 }| |¡ |d7 }|dkr_|d7 }|d	7 }q|d7 }n|d7 }||k s|dkry|| nd
}	|	|t|ƒfS )zSingle-pass Big5 structural analysis.

    Lead 0xA1-0xF9; Trail 0x40-0x7E, 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r'   éù   r   r   r   r*   r   r   r   r   r   r"   r"   r#   Ú_analyze_big5'  s2   	.

ïr=   c           
      C   s  d}d}d}t ƒ }d}t| ƒ}||k rq| | }d|  kr!dkrin nF|d7 }|d |k rd| |d  }d|  kr=dksJn d|  krHdkrdn n|d7 }| |¡ |d7 }|dkr_|d7 }|d	7 }q|d7 }n|d7 }||k s|dkry|| nd
}	|	|t|ƒfS )aW  Single-pass Big5-HKSCS structural analysis.

    Lead bytes: 0x87-0xFE
    Trail bytes: 0x40-0x7E, 0xA1-0xFE

    Extends Big5 by lowering the lead byte floor from 0xA1 to 0x87 and
    raising the ceiling from 0xF9 to 0xFE.  0x7F and 0x80-0xA0 are not
    valid Big5/HKSCS trail bytes.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   é‡   r*   r   r   r   r'   r   r   r   r   r   r"   r"   r#   Ú_analyze_big5hkscsL  s2   .

ïr?   c           
      C   s8  d}d}d}t ƒ }d}t| ƒ}||k r‹| | }d|  kr!dks9n d|  kr,dks9n d|  kr7dkrƒn nJ|d7 }|d |k r~| |d  }d	|  krSd
ks`n d|  kr^dkr~n n|d7 }| |¡ |dkrq|d7 }|dkry|d7 }|d7 }q|d7 }n|d7 }||k s|dkr“|| nd}	|	|t|ƒfS )z«Single-pass Johab structural analysis.

    Lead: 0x84-0xD3, 0xD8-0xDE, 0xE0-0xF9
    Trail: 0x31-0x7E, 0x91-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   é„   éÓ   éØ   éÞ   r
   r<   r   é1   r   é‘   r*   r   r   r   r   r   r"   r"   r#   Ú_analyze_johabv  s4   
D.

ïrF   )Úshift_jis_2004Úcp932Úeuc_jis_2004Úeuc_krÚcp949Úgb18030Ú	big5hkscsÚjohabÚ
_ANALYZERSÚnameÚctxNc                 C   sD   |j  |¡}|dur|S t |¡}|du rdS || ƒ}||j |< |S )z/Return cached analysis or compute and cache it.N)Úanalysis_cacheÚgetrO   )r   rP   rQ   ÚcachedÚanalyzerÚresultr"   r"   r#   Ú_get_analysis¬  s   

rW   Úencoding_infoc                 C   s0   | r|j sdS t| |j|ƒ}|du rdS |d S )a•  Return 0.0--1.0 indicating how well *data* matches the encoding's structure.

    For single-byte encodings, always returns 0.0.  For empty data, always
    returns 0.0.

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :returns: A structural fit score between 0.0 and 1.0.
    r   Nr   ©Úis_multibyterW   rP   ©r   rX   rQ   rV   r"   r"   r#   Úcompute_structural_scoreÀ  s   
r\   Únon_ascii_countc                 C   sh   | r|j sdS t| |j|ƒ}|du rdS |d }|dur|nt| ƒt|  dt¡ƒ }|dkr0dS || S )av  Ratio of non-ASCII bytes that participate in valid multi-byte sequences.

    Genuine CJK text has nearly all non-ASCII bytes paired into valid
    multi-byte sequences (coverage close to 1.0), while Latin text with
    scattered high bytes has many orphan bytes (coverage well below 1.0).

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :param non_ascii_count: Pre-computed count of non-ASCII bytes, or ``None``
        to compute from *data*.
    :returns: A coverage ratio between 0.0 and 1.0.
    r   Nr   r   )rZ   rW   rP   r   Ú	translater   )r   rX   rQ   r]   rV   Úmb_bytesÚ	non_asciir"   r"   r#   Úcompute_multibyte_byte_coverage×  s   
ÿýra   c                 C   s0   | r|j sdS t| |j|ƒ}|du rdS |d S )aá  Count distinct lead byte values in valid multi-byte pairs.

    Genuine CJK text uses lead bytes from across the encoding's full
    repertoire.  European text falsely matching a CJK structural scorer
    clusters lead bytes in a narrow band.

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :returns: The number of distinct lead byte values found.
    r   Né   r   rY   r[   r"   r"   r#   Úcompute_lead_byte_diversityþ  s   
rc   )N)Ú__doc__Úcollections.abcr   Úchardet.pipeliner   r   Úchardet.registryr   ÚbytesÚtupleÚfloatÚintr$   r%   r-   r.   r6   r;   r=   r?   rF   rO   ÚdictÚstrÚ__annotations__rW   r\   ra   rc   r"   r"   r"   r#   Ú<module>   sÂ    ÿ
þ&ÿ
þ)ÿ
þ:ÿ
þ ÿ
þ.ÿ
þ1ÿ
þ%ÿ
þ*ÿ
þ+*øÿÿÿ
þÿÿÿ
þüÿþýü
û'ÿÿÿþ