o
    ji9                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZ e eZer_d
dlm Z  d
dl!m"Z" eG dd dZ#G dd de$Z%G dd dZ&dS )    N)deque)asdict	dataclassfield)TYPE_CHECKINGAnyDictIteratorListOptionalTuple)
NumberTree)PDFPage)	PDFParser)	PDFObjRefresolve1)	PSLiteral   )decode_text)Page)PDFc                   @   s   e Zd ZU eed< ee ed< ee ed< ee ed< ee ed< ee ed< ee ed< ee ed< eed	Z	e
eef ed
< eed	Zee ed< eed	Zed  ed< ded  fddZde
eef fddZdS )PDFStructElementtyperevisionidlangalt_textactual_texttitlepage_number)default_factory
attributesmcidschildrenreturnc                 C   
   t | jS Niterr#   self r+   Z/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/pdfplumber/structure.py__iter__$      
zPDFStructElement.__iter__c                 C   sx   t | }t|g}|r:| }t| D ]}|| du s)|| g ks)|| i kr,||= qd|v r8||d  |s|S )z'Return a compacted dict representation.Nr#   )r   r   popleftlistkeysextend)r*   rdelkr+   r+   r,   to_dict'   s   
$zPDFStructElement.to_dictN)__name__
__module____qualname__str__annotations__r   intr   dictr!   r   r   r0   r"   r
   r#   r	   r-   r7   r+   r+   r+   r,   r      s   
 r   c                   @   s   e Zd ZdS )StructTreeMissingN)r8   r9   r:   r+   r+   r+   r,   r?   5   s    r?   c                   @   s   e Zd ZU dZee ed< dddded fddZd	ee	e
f d
ee dee	e
f fddZd	e
deee ee
 f fddZdee
 ddfddZd	ee	e
f defddZdddZdee	e
f ddfddZdee fddZdS )PDFStructTreeaz  Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    pageNdocr   r   c                    s  |j | _ d| j jvrtdt| j jd | _t| jdi | _t| jdi | _g | _|d urp|j	| _
d | _| jd}|d u rI|   d S t|}d| j
jvrUd S | j
jd  tt fdd|jD }| | d S d | _
d	d
 |jD | _|   d S )NStructTreeRootzPDF has no structureRoleMapClassMap
ParentTreeStructParentsc                 3   s     | ]\}}| kr|V  qd S r&   r+   ).0numarray	parent_idr+   r,   	<genexpr>n   s    z)PDFStructTree.__init__.<locals>.<genexpr>c                 S   s   i | ]}|j j|jqS r+   )page_objpageidr   )rH   rA   r+   r+   r,   
<dictcomp>t   s    z*PDFStructTree.__init__.<locals>.<dictcomp>)rB   catalogr?   r   rootgetrole_map	class_mapr#   rN   rA   	page_dict_parse_struct_treer   attrsnextvalues_parse_parent_treepages)r*   rB   rA   parent_tree_objparent_treeparent_arrayr+   rK   r,   __init__M   s4   zPDFStructTree.__init__objr   r$   c                 C   s,  g }dD ]}||vrqt || }t|tr|| q|| qg }d }|D ]$}t|tr?||kr<|d ur<|| d }q(|d urH|| t |}q(|d urV|| i }	|D ]9}t|trwt|j}|| j	vrrt
d| qZ| j	| }| D ]\}
}t|trt|j|	|
< q{||
 |	|
< q{qZ|	S )N)CAzUnknown attribute class %s)r   
isinstancer0   r2   appendr=   r   r   namerU   loggerwarningitems)r*   ra   r   attr_obj_listkeyattr_obj	attr_objsprev_objarefattrr6   vr+   r+   r,   _make_attributesy   sD   










zPDFStructTree._make_attributesc                 C   s  d|vs
J d| d|vsJ d| d }| j d ur4d|v r4|d j}|| j v s/J d| | j | }d}d|v rNt|d j}|| jv rNt| j| j}d	|v rXt|d	 ng }t|trc|g}n
t|trm|d	 g}|	d
}| 
||}d|v rt|d nd }d|v rt|d nd }	d|v rt|d nd }
d|v rt|d nd }d|v rt|d nd }t|||||
|	|||d	}||fS )NMCIDzUncaught MCR: %sObjzUncaught OBJR: %sPgzObject on unparsed page: %s SKRIDTLangAlt
ActualText)	r   r   r   r   r   r   r   r   r!   )rV   objidr   rf   rT   r   rd   r=   r>   rS   rr   r   )r*   ra   r   
page_objidobj_tagr#   r   r!   
element_idr   r   r   r   elementr+   r+   r,   _make_element   sH   






zPDFStructTree._make_elementr_   c           	      C   s   t |}i }d}|rM| }|tjkrqt||v rqt|}d|v r/t|d jdkr/d}n| |\}}|dus<J ||f|t|< |	|d  |s
|sQJ | 
| dS )zYPopulate the structure tree using the leaves of the parent tree for
        a given page.FTyperC   TNP)r   r/   r   KEYWORD_NULLreprr   r   rf   r   re   _resolve_children)	r*   r_   r4   s
found_rootrefra   r   r#   r+   r+   r,   r[      s&   
z PDFStructTree._parse_parent_treec                 C   sH   d|vrdS |d j }| jd ur|| jv S | jd ur"|| jjkr"dS dS )Nru   TF)r   rV   rA   rO   )r*   ra   r   r+   r+   r,   on_parsed_page   s   



zPDFStructTree.on_parsed_pagec                    s2  t jd }t|trjd g}t|}i |r}| }t|v r%qt |}t|tr@d|v r@|s8q|d }t |}|\}}||ft|< |D ])}t |}t|trp|sbqQd|v rk|d }nd|v rpqQt|t	rz|
| qQ|sdtt dtt f fdd  |  dS )	zgPopulate the structure tree starting from the root, skipping
        unparsed pages and empty elements.rx   rt   rs   elementsr$   c                    s   g }| D ]U}t |}t|tr|| qt|tr4|s qd|v r,||d  qd|v r4|d }t| \}} |}|d u sF|sLt|= q||ft|< || q|S )Nrs   rt   )r   rd   r=   re   r>   r   r   )r   next_elementsr   ra   r   r#   pruner   r*   r+   r,   r   %  s*   



z/PDFStructTree._parse_struct_tree.<locals>.pruneN)r   rR   rd   r>   r   r/   r   r   r   r   re   r
   r   r   )r*   rR   r4   r   ra   r   r#   childr+   r   r,   rW      sB   






"z PDFStructTree._parse_struct_treeseenc                    sr  t | jd }t|tr| jd g}g | _g }|D ]$}t |}t|tr2d|v r2| |s.q|d }t| v r=|| qt|}|r|	 } t| \}}|dusXJ d|D ]P}	t |	}t|t
rl|j| n t|tr| |swqZd|v r|j|d  nd|v r|d }	t|	tr t|	d\}
}|
dur|j|
 ||	 qZ|sD fdd|D | _dS )	z|Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.
        rx   rt   NzUnparsed elementrs   )NNc                    s   g | ]
} t | d  qS )r   )r   )rH   r   r   r+   r,   
<listcomp>k  s    z3PDFStructTree._resolve_children.<locals>.<listcomp>)r   rR   rd   r>   r#   r   r   re   r   r/   r=   r"   r   rS   )r*   r   rR   parsed_rootr   ra   r4   r   r#   r   child_element_r+   r   r,   r   A  sL   







zPDFStructTree._resolve_childrenc                 C   r%   r&   r'   r)   r+   r+   r,   r-   m  r.   zPDFStructTree.__iter__r&   )r$   N)r8   r9   r:   __doc__r   r   r<   r`   r   r;   r   r=   rr   r   r   r
   r   r[   boolr   rW   r   r	   r-   r+   r+   r+   r,   r@   9   s"   
 ,


"1(
D,r@   )'loggingcollectionsr   dataclassesr   r   r   typingr   r   r   r	   r
   r   r   pdfminer.data_structuresr   pdfminer.pdfpager   pdfminer.pdfparserr   pdfminer.pdftypesr   r   pdfminer.psparserr   utilsr   	getLoggerr8   rg   rA   r   pdfr   r   
ValueErrorr?   r@   r+   r+   r+   r,   <module>   s$    $
