o
    j\                  	   @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d	d
lm Z  d	dl!m"Z"m#Z#m$Z$m%Z% d	dl&m'Z' d	dl(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ d	dl m0Z0m1Z1m2Z2 d	dl3m4Z4 e 5dZ6e7g dZ8erd	dl9m:Z: d	dl;m<Z< ddddddZ=de>de?fddZ@deed f dee
eeeAeBf d f  e
e? f fd!d"ZCdedee
eeeAeBf d f  e
e? f fd#d$ZDd%ee?ef dee?ef fd&d'ZEG d(d) d)eZFd=d*e"d+e#de"fd,d-ZGd*e"d.e#de"fd/d0ZHG d1d2 d2e'ZIG d3d4 d4eIZJd5e"d6e"ddfd7d8ZKG d9d: d:eJZLG d;d< d<eJZMdS )>    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)PDFPageAggregator)LTCharLTComponentLTContainerLTCurveLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)PDFStructTreeStructTreeMissing)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)TextMapz^LT)advheight	linewidthptssizesrcsizewidthx0x1y0y1bitsmatrixuprightfontnametext	imagemask
colorspaceevenoddfillnon_stroking_colorstrokestroking_colorstreammcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r8   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r8   split_atprefixsuffix
suffix_new rS   U/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytesW   s   
rU   color.c                 C   s4   t | d tr| d d pd t| d jfS | d fS )NrJ   )
isinstancer   r&   name)rV   rS   rS   rT   separate_patternb   s   rY   c                 C   sJ   | d u rdS t | tr| }t|S t | trt| }t|S | f}t|S )N)NN)rW   tuplelistrY   )rV   	tuplefiedrS   rS   rT   normalize_colork   s   

r]   kwargsc                 C   s   dd |   D S )Nc                 S   s(   i | ]\}}|t |trt|n|qS rS   )rW   r[   rZ   ).0keyvaluerS   rS   rT   
<dictcomp>z   s    z'tuplify_list_kwargs.<locals>.<dictcomp>)items)r^   rS   rS   rT   tuplify_list_kwargsy   s   rd   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagrC   propsrF   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.MCIDN)r&   rX   rg   rW   dictrf   )selfrC   rh   rS   rS   rT   	begin_tag   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)rg   rf   rk   rS   rS   rT   end_tag   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s,   | j jr| j jd }| j|_| j|_dS dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rJ   N)cur_item_objsrf   rB   rg   rC   )rk   cur_objrS   rS   rT   tag_cur_item   s
   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charrr   )rk   argsr^   r*   	__class__rS   rT   rt      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rs   render_imagerr   rk   ru   r^   rv   rS   rT   ry         z/PDFPageAggregatorWithMarkedContent.render_imagec                    rx   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rs   
paint_pathrr   rz   rv   rS   rT   r|      r{   z-PDFPageAggregatorWithMarkedContent.paint_pathNrF   N)__name__
__module____qualname____doc__rf   r	   int__annotations__rg   rN   r   r   rl   rn   rr   floatrt   ry   r|   __classcell__rS   rS   rv   rT   re      s   
 

re   box_rawrotationc                 C   sP   t | d | d f\}}t | d | d f\}}|dv r"||||fS ||||fS )Nr   rI   r      )Z   i  )sorted)r   r   r1   r2   r3   r4   rS   rS   rT   _normalize_box   s
   r   	mb_heightc                 C   s    | \}}}}||| ||| fS r}   rS   )r   r   r1   r3   r2   r4   rS   rS   rT   _invert_box      r   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dgddd	ed
edefddZdhddZedefddZedefddZedeeeef  fddZedefddZedefddZedefddZedeeef fddZdeeef deeef fd d!Zd"edefd#d$Z d%ee! de"eddf fd&d'Z#deeef fd(d)Z$	did*e%e& de'fd+d,Z(	did*e%e& dee) fd-d.Z*	did*e%e& de%e) fd/d0Z+	did*e%e& deeee%e    fd1d2Z,	did*e%e& de%eee%e    fd3d4Z-d5ede.fd6d7Z/					djd8e0ee1e f d9e	d:e	d;ed<e	d=e	d5edeeeef  fd>d?Z2d5edefd@dAZ3d5edefdBdCZ4d5edefdDdEZ5	dkdFe	d<e	d5edefdGdHZ6	dldJe7dKe	dLe	ddMfdNdOZ8	dldJe7dKe	dLe	ddMfdPdQZ9	dldJe7dKe	dLe	ddMfdRdSZ:dTe;ege	f ddUfdVdWZ<d5eddUfdXdYZ=				I	IdmdZe%e0ee>f  d[e%e0ee>f  d\e%e0ee>f  d]e	d^e	dd_fd`daZ?didbe%ee  deeef fdcddZ@defdedfZAdS )nPage_layoutcached_propertiesTis_originalNr   pdfrE   page_objpage_numberinitial_doctopc           	         s   || _ | | _ | _|| _|| _ddtdtdtf fdd}|dd}|d | _t|d	| j}|d
 |d  }t	||| _
d jv rOt	t|d| j|| _n| j
| _| j
| _t | j| _d S )Nr`   defaultrF   c                    s     j | }|d u r|S t|S r}   )attrsrM   r'   )r`   r   refr   rS   rT   get_attr   r   zPage.__init__.<locals>.get_attrRotater   ih  MediaBoxr   r   CropBoxr}   )r   	root_pager   r   r   rN   r   r   r   r   mediaboxr   cropboxbboxr   _get_textmapget_textmap)	rk   r   r   r   r   r   	_rotationmb_rawr   rS   r   rT   __init__   s$   


zPage.__init__rF   c                 C   s   |    d S r}   )flush_cacherm   rS   rS   rT   close   s   z
Page.closec                 C      | j d | j d  S )NrI   r   r   rm   rS   rS   rT   r0         z
Page.widthc                 C   r   )Nr   r   r   rm   rS   rS   rT   r+      r   zPage.heightc                 C   s0   zdd t | j| D W S  ty   g  Y S w )z-Return the structure tree for a page, if any.c                 S   s   g | ]}|  qS rS   )to_dict)r_   elemrS   rS   rT   
<listcomp>  s    z'Page.structure_tree.<locals>.<listcomp>)r    r   r!   rm   rS   rS   rT   structure_tree  s
   zPage.structure_treec                 C   sR   t | dr| jS t| jj| j| jjd}t| jj|}|| j	 |
 | _| jS )Nr   )pagenolaparams)hasattrr   re   r   rsrcmgrr   r   r   process_pager   
get_result)rk   deviceinterpreterrS   rS   rT   layout	  s   

zPage.layoutc                    s4   dt dt f fdd}t jjpg }tt||S )NannotrF   c                    s  | d }|  di }| d|  d|  dd}| D ]"\}}|d ur?z	|d||< W q ty>   |d||< Y qw q jd	|d
 |d |d |d  j j |d   j|d   j|d  |d |d
  |d |d  d}|| d| v r | d< | |d< |S )NRectAURITContents)urititlecontentszutf-8zutf-16r   r   r   rI   r   )r   object_typer1   r3   r2   r4   doctoptopbottomr0   r+   Pdata)rM   rc   decodeUnicodeDecodeErrorr   r   r+   update)r   rectaextraskvparsedrm   rS   rT   parse  s>   
zPage.annots.<locals>.parse)r   r'   r   annotsr[   map)rk   r   rawrS   rm   rT   r     s   %zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrS   )r_   r   rS   rS   rT   r   C  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rm   rS   rS   rT   
hyperlinksA  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrm   rS   rS   rT   objectsE  s   

zPage.objectsptc                 C   s   |d | j |d  fS )Nr   r   )r+   )rk   r   rS   rS   rT   point2coordL     zPage.point2coordobjc           	         s  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d<  j|d< dD ]}t||rGtt||j||< q6d	D ]\}}||v r^t|| \||< ||< qJt|ttfrl| |d
< t|tr|j}t|j\|d< |d< t|j\|d< |d< t|d trt|d |d< n#t|tfrtt j |d |d<  fdd|j!D |d< |j"|d< d|v r܈ j#|d  |d<  j#|d  |d<  j$|d  |d< |S )N itemrF   c                 S   s$   | \}}|t v rt|}||fS d S r}   )	ALL_ATTRSr'   )r   r   r   resrS   rS   rT   process_attrR  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsscs))r@   stroking_pattern)r>   non_stroking_patternr9   r@   r   r>   r   r8   r-   c                    s$   g | ]^}}|gt  j|R qS rS   )r   r   )r_   cmdr-   rm   rS   rT   r     s   $ z'Page.process_object.<locals>.<listcomp>pathdashr3   r4   r   r   r   )%resublt_patrw   r   lowerr   rN   r   r	   rj   filterr   __dict__rc   r   r   r(   getattrrX   r]   rW   r   r   get_textgraphicstatescolorncolorbytesrU   r   r[   r   original_pathdashing_styler+   r   )	rk   r   kindr   attrcs
color_attrpattern_attrgsrS   rm   rT   process_objectO  sF   &



zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S r}   )rW   r   r   r   r   iter_layout_objectsrp   )rk   r   r   rS   rS   rT   r    s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r  r   rp   rM   append)rk   r   r   r   rS   rS   rT   r     s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S r}   )r%   resolver$   rk   r  tsetrS   rS   rT   debug_tablefinder  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS r}   )r%   r  r$   tablesr  rS   rS   rT   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   xrF   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r  rS   rS   rT   sorter  s   zPage.find_table.<locals>.sorter)r`   )
r%   r  r
  r  r#   r   r   r   r[   r   )rk   r  r  r	  r  largestrS   rS   rT   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rS   )extracttext_settings)r_   tabler  rS   rT   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r%   r  r
  )rk   r  r	  rS   r  rT   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrS   )r%   r  r  r  r  )rk   r  r  r  rS   rS   rT   extract_table  s
   

zPage.extract_tabler^   c                 K   s\   t | jd}d|vr|d| ji d|vr|d| ji i ||}tj| jfi |S )N)layout_bboxlayout_width_charslayout_widthlayout_height_charslayout_height)rj   r   r   r0   r+   r   chars_to_textmapchars)rk   r^   defaultsfull_kwargsrS   rS   rT   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s*   | j di t|}|j||||||dS )N)r#  r$  r%  r&  r'  rS   )r   rd   search)	rk   r"  r#  r$  r%  r&  r'  r^   textmaprS   rS   rT   r(    s   
zPage.searchc                 K   s   | j di t|jS r  )r   rd   	as_stringrk   r^   rS   rS   rT   extract_text  r   zPage.extract_textc                 K      t j| jfi |S r}   )r   extract_text_simpler  r+  rS   rS   rT   r.       zPage.extract_text_simplec                 K   r-  r}   )r   extract_wordsr  r+  rS   rS   rT   r0    r/  zPage.extract_wordsstripc                 K   s   | j di t|j||dS )N)r1  r&  rS   )r   rd   extract_text_lines)rk   r1  r&  r^   rS   rS   rT   r2    s   zPage.extract_text_linesFr   relativestrictCroppedPagec                 C   s   t | |||dS )N)r3  r4  )r5  rk   r   r3  r4  rS   rS   rT   crop  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )r3  r4  crop_fn)r5  r   within_bboxr6  rS   rS   rT   r;       zPage.within_bboxc                 C   r8  r9  )r5  r   outside_bboxr6  rS   rS   rT   r=    r<  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S r}   )r?  )rk   r>  rS   rS   rT   r   !     
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text, fontname, size,
        and positioning (within `tolerance`) as other characters on the page.
        c                 S   s   dS )NTrS   r  rS   rS   rT   <lambda>)  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rS   rS   )r_   r   objsrS   rS   rT   rb   *  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)r?  r   rc   r   r   dedupe_charsr  )rk   r^   prS   rS   rT   rD  $  s   zPage.dedupe_chars
resolutionr0   r+   	antialiasforce_mediaboxrD   c           	      C   s   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;|||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONrD   c                 s   s    | ]}|d uV  qd S r}   rS   )r_   r  rS   rS   rT   	<genexpr>>  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )rF  rG  rH  )displayrI  rD   sum
ValueErrorr0   r+   )	rk   rF  r0   r+   rG  rH  rI  rD   	num_specsrS   rS   rT   to_image.  s    zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r   r   r   r   r0   r+   s)r[   r   keysr   r   r   r   r   r   r0   r+   r   )rk   rQ  _object_typesdtrS   rS   rT   r   O  s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rm   rS   rS   rT   __repr__b  s   zPage.__repr__r   r~   r}   )TTr   TT)TT)FT)NNNFF)Br   r   r   r   r   r   rN   r   r   boolpagesr   r   r   r   r   propertyr0   r+   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r	   r"   r$   r  r#   r
  r  r  r  r)   r   r   r
   r(  r,  r.  r0  r2  r   r7  r;  r=  r   r   rD  r   rP  r   rX  rS   rS   rS   rT   r      s6  
 

))"C






	






$!r   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sT   || _ |j| _|j| _|j| _|j| _|j| _|j| _| tj	 t
 | j| _d S r}   )r^  r   r   r   r   r   r   r   r   r   r   r   r   )rk   r^  rS   rS   rT   r   i  s   zDerivedPage.__init__N)r   r   r   r   rZ  r   r   r   rS   rS   rS   rT   r]  f  s   
 r]  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   calculate_arearN  get_bbox_overlap)r   r_  	bbox_areaoverlapoverlap_arearS   rS   rT   test_proposed_bboxu  s$   

re  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r5  FTr^  	crop_bboxr:  r3  r4  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )NrB  rF   c                    s
   |  S r}   rS   )rB  rf  r:  rS   rT   _crop_fn  r@  z&CroppedPage.__init__.<locals>._crop_fn)r   re  r   rs   r   rh  r   r=  )rk   r^  rf  r:  r3  r4  o_x0o_top_r1   r   r2   r   rh  rv   rg  rT   r     s   

zCroppedPage.__init__rF   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rS   )rh  r_   r   r   rm   rS   rT   rb     s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   r^  r   rc   rm   rS   rm   rT   r     s   


zCroppedPage.objects)r   r   r   r   crop_to_bboxr   r   r   r   rZ  r   r\  r   rN   r   r   rS   rS   rv   rT   r5    s"     r5  c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )r?  r^  	filter_fnc                    s   |j | _ || _t | d S r}   )r   rp  rs   r   )rk   r^  rp  rv   rS   rT   r     s   zFilteredPage.__init__rF   c                    rl  )Nr   c                    s"   i | ]\}}|t t j|qS rS   )r[   r   rp  rm  rm   rS   rT   rb     s    z(FilteredPage.objects.<locals>.<dictcomp>rn  rm   rS   rm   rT   r     s   


zFilteredPage.objects)r   r   r   r   r   r   rZ  r   r\  r   rN   r   r   r   rS   rS   rv   rT   r?    s      r?  rY  )Nr   	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.psparserr   r   r   _typingr   r   r   r   	containerr   	structurer    r!   r  r"   r#   r$   r%   r&   r'   r(   
utils.textr)   compiler   setr   rL  rD   r   rE   rL   r   rN   rU   r   r   rY   r]   rd   re   r   r   r   r]  re  r5  r?  rS   rS   rS   rT   <module>   sf    0$	
 	
"
	"
"3   !(