o
    ji                     @   s<  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZ d dl m!Z!m"Z" d dl#m$Z% d dl#m&Z&m'Z'm(Z( e)e&e' e( Z*e*+d dd Z,dd Z-dd Z.g dZ/g dZ0dOddZ1dOddZ2G dd de3Z4dd  Z5d!d" Z6d#d$ Z7d%d& Z8d'd( Z9d)d* Z:d+d, Z;d-e<fd.d/Z=d-e<fd0d1Z>d-e?fd2d3Z@d-e?fd4d5ZAdPd7d8ZBdQd9d:ZCdQd;d<ZDdRd>d?ZE	dSd@dAZFdBdC ZGdDdE ZH	F	G	F	H	F	I	IdTdJdKZIdUdMdNZJdS )V    N)groupby)
itemgetter)	PDFParser)PDFDocument)PDFPage)PDFTextExtractionNotAllowed)PDFResourceManager)PDFPageInterpreter)PDFPageAggregator)LAParamsLTAnnoLTCharLTTextLineHorizontalLTTextLineVerticalLTImage)Requesturlopen)urlparse)uses_relativeuses_netlocuses_params c                 C   s&   zt | jtv W S  ty   Y dS w )zCheck to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If url has a valid protocol return True otherwise False.

    F)	parse_urlscheme_VALID_URLS	Exception)url r   S/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/camelot/utils.pyis_url(   s
   r   c                 C   s4   d}| r|t tjtj tj 7 }| d8 } | s|S )Nr      )randomchoicestringdigitsascii_lowercaseascii_uppercase)lengthretr   r   r   random_string;   s   r)   c                 C   s   t d d}tjddd+}ddi}t| d|}t|}|  }|d	kr+td
||	  W d   n1 s<w   Y  t
jt
j|j|}t|j| |S )zDownload file from specified URL.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Temporary filepath.

       z.pdfwbF)deletez
User-AgentzMozilla/5.0Nzapplication/pdfzFile format not supported)r)   tempfileNamedTemporaryFiler   r   infoget_content_typeNotImplementedErrorwritereadospathjoindirnamenameshutilmove)r   filenamefheadersrequestobjcontent_typefilepathr   r   r   download_urlE   s   rB   )columnsedge_tolrow_tol
column_tol)
process_background
line_scale	copy_text
shift_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolutionlatticec                    s0    fdd} dkr|t |  d S |t|  d S )Nc                    s<   t | t | }|rtdt| d  dd S )N,z cannot be used with flavor='')setintersectionkeys
ValueErrorr6   sorted)parser_kwargsinput_kwargsisecflavorr   r   check_intersectionp   s   z*validate_input.<locals>.check_intersectionrQ   )stream_kwargslattice_kwargs)kwargsr]   r^   r   r\   r   validate_inputo   s   rb   c                 C   sP   |dkr|   D ]}|tv r| | q| S |   D ]}|tv r%| | q| S )NrQ   )rV   r_   popr`   )ra   r]   keyr   r   r   remove_extra}   s   

re   c                   @   s   e Zd Zdd Zdd ZdS )TemporaryDirectoryc                 C   s   t  | _| jS N)r-   mkdtempr8   )selfr   r   r   	__enter__   s   
zTemporaryDirectory.__enter__c                 C   s   t | j d S rg   )r9   rmtreer8   )ri   exc_type	exc_value	tracebackr   r   r   __exit__   s   zTemporaryDirectory.__exit__N)__name__
__module____qualname__rj   ro   r   r   r   r   rf      s    rf   c                 C   s   || 7 }|S )zTranslates x2 by x1.

    Parameters
    ----------
    x1 : float
    x2 : float

    Returns
    -------
    x2 : float

    r   )x1x2r   r   r   	translate      ru   c                 C   s   | |9 } | S )zScales x by scaling factor s.

    Parameters
    ----------
    x : float
    s : float

    Returns
    -------
    x : float

    r   )xsr   r   r   scale   rv   ry   c           
      C   sv   | \}}}}|\}}}t ||}t tt| ||}t ||}t tt| ||}t|t|t|t|f}	|	S )ay  Translates and scales pdf coordinate space to image
    coordinate space.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.

    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
        space.

    )ry   absru   int)
kfactorsrs   y1rt   y2scaling_factor_xscaling_factor_ypdf_yknewr   r   r   	scale_pdf   s   


r   c                    s  |\ i }|   D ]P}|\}}}}	t|}ttt  |}t|}ttt  |	}	t| |  \}
}fdd|
D }
 fdd|D }t|
|}||||||	f< qg }|D ]5}t|d t|d }}ttt  |d ttt  |d }}	|||||	f q`g }|D ]5}t|d t|d }}ttt  |d ttt  |d }}	|||||	f q|||fS )av  Translates and scales image coordinate space to pdf
    coordinate space.

    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.

    Returns
    -------
    tables_new : dict
    v_segments_new : dict
    h_segments_new : dict

    c                    s   g | ]}t | qS r   )ry   .0j)r   r   r   
<listcomp>   s    zscale_image.<locals>.<listcomp>c                    s"   g | ]}t tt  |qS r   )ry   rz   ru   r   )img_yr   r   r   r      s   " r      r       )rV   ry   rz   ru   zipappend)tables
v_segments
h_segmentsr}   
tables_newr|   rs   r~   rt   r   j_xj_yjointsv_segments_newvh_segments_newhr   )r   r   r   r   scale_image   s8   




r   c                 C   sh   d}t dd |D }t dd |D }||k r2tdd | D }tdd | D }||k r0dnd	}|S )
aC  Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.
    ltchar : list
        List of PDFMiner LTChar objects.

    Returns
    -------
    rotation : string
        '' if text in table is upright, 'anticlockwise' if
        rotated 90 degree anticlockwise and 'clockwise' if
        rotated 90 degree clockwise.

    r   c                 S      g | ]
}|   r|qS r   get_textstripr   tr   r   r   r   (      z get_rotation.<locals>.<listcomp>c                 S   r   r   r   r   r   r   r   r   )  r   c                 s   s,    | ]}|j d  dk o|j d dkV  qdS r    r   r   Nmatrixr   r   r   r   	<genexpr>+     * zget_rotation.<locals>.<genexpr>c                 s   s,    | ]}|j d  dko|j d dk V  qdS r   r   r   r   r   r   r   ,  r   anticlockwise	clockwise)lensum)charshorizontal_textvertical_textrotationhlenvlenr   r   r   r   r   get_rotation  s   r   c                    sP   | d | d f | d | d f fdd|D } fdd|D }||fS )a6  Returns all line segments present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of vertical horizontal segments.

    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.

    r   r    r   r   c                    sd   g | ].}|d   d  d kr|d d  d k r d d |d   kr,d d krn n|qS )r    r   r   r   r   )r   r   lbrtr   r   r   I  
    Tz$segments_in_bbox.<locals>.<listcomp>c                    sd   g | ].}|d   d  d kr|d d  d k r d d |d   kr,d d krn n|qS r   r   r    r   )r   r   r   r   r   r   N  r   r   )bboxr   r   v_sh_sr   r   r   segments_in_bbox1  s   r   c                    s   | d | d f | d | d f fdd|D }dd |D }|D ](}|  D ]!}||kr0q)t||rJt||t| d	krJt||rJ|| q)q#t|}|S )
a  Returns all text objects present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, discarding the overlapping ones

    r   r    r   r   c                    sx   g | ]8} d  d |j |j d   krd  d kr:n n d d |j|j d   kr6d d krn n|qS )r   r          @r    )x0rs   y0r~   r   r   r   r   r   i  s    44z text_in_bbox.<locals>.<listcomp>c                 S   s   h | ]}|qS r   r   r   r   r   r   	<setcomp>q  s    ztext_in_bbox.<locals>.<setcomp>g?)copybbox_intersectbbox_intersection_area	bbox_areabbox_longerdiscardlist)r   textt_bboxrestbabbunique_boxesr   r   r   text_in_bboxV  s$   


r   returnc                 C   s`   t | j|j}t| j|j}t| j|j}t | j|j}||k s$||kr&dS || ||  }|S )a.  Returns area of the intersection of the bounding boxes of two PDFMiner objects.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the intersection of the bounding boxes of both objects

            )maxr   minr~   rs   r   )r   r   x_lefty_topx_righty_bottomintersection_arear   r   r   r     s   r   c                 C   s   | j | j | j| j  S )zReturns area of the bounding box of a PDFMiner object.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Area of the bounding box of the object

    rs   r   r~   r   )r   r   r   r   r     s   r   c                 C   s0   | j |jko|j | jko| j|jko|j| jkS )a   Returns True if the bounding boxes of two PDFMiner objects intersect.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    r   r   r   r   r   r   r     s   0r   c                 C   s   | j | j |j |j kS )a3  Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    )rs   r   r   r   r   r   r     s   r   r   c                 C   sZ   g }| D ]&}|s| | q|d }tj|||dr%|| d }||d< q| | q|S )zMerges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.

    Parameters
    ----------
    ar : list
    line_tol : int, optional (default: 2)

    Returns
    -------
    ret : list

    )atolr   )r   npisclose)arrK   r(   atempr   r   r   merge_close_lines  s   
r   c                 C   s6   |s| S t jddtt j| dd| t jd}|S )a  Strips any characters in `strip` that are present in `text`.
    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.
    Returns
    -------
    stripped : str
    [r   ])flags)resubr6   mapescapeUNICODE)r   r   strippedr   r   r   
text_strip  s    r   c                 C   s  |dkrdd | D }n|dkrdd | D }dd |D }t t|dkrzg }t|}t|tdD ]@\}}||kr]dd |D }	d	|	 r\|	d
d |	d |d	|	 q3dd |D }	d	|	 rs|d	|	 q3d	|}
n
d	dd | D }
t	|
|S )a  Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.

    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    fstring : string

    
horizontalc                 S   .   g | ]}t |ts| tj|jd dfqS r*   )decimals)
isinstancer   r   r   roundheightr   r   r   r   r         z"flag_font_size.<locals>.<listcomp>verticalc                 S   r   r   )r   r   r   r   r   widthr   r   r   r   r     r   c                 S   s   g | ]\}}t j|d dqS r   )r   r   )r   r   sizer   r   r   r   !  s    r    c                 S      g | ]}|d  qS r   r   r   r   r   r   r   '      r   r   z<s>z</s>c                 S   r   r   r   r   r   r   r   r   -  r   c                 S   s   g | ]}|  qS r   r   r   r   r   r   r   2  r   )
r   rT   r   r   r   r6   r   insertr   r   )textline	direction
strip_textdlflistmin_sizerd   r   fcharsfstringr   r   r   flag_font_size  s6   

r  Fc              
      s`  d}g }|j  zO|dkr| s fddtjD } fddtjD }|d fdd|D }	|	sF|d j d jfg}	|jD ]a}
j }|	D ]W}t|
t	r|d |
j
|
j d	   krm|d krn n|
j|
j d	 |d kr||d |
f  n#||	d kr||d d |
f qRt|
tr||d |
f qRqIn|d
krV| sV fddtjD } fddtjD }|d fdd|D }|s|d jd  jfg}|jD ]g}
j }|D ]]}t|
t	rD|d |
j|
j d	   kr|d kr0n n|
j
|
j d	 |d kr0||d |
f  n%||d krC||d d |
f qt|
trT||d |
f qqW n tyi   dd| fg Y S w g }t|tddD ]9\}}|r||d |d tdd |D ||df qtdd |D }||d |d td||f qt|S )a9  Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.

    Parameters
    ----------
    table : camelot.core.Table
    textline : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts.)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    grouped_chars : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is the an lttextline substring.

    r   r   c                    4   g | ]\}}|d   d kr d  |d kr|qS r   r   )r   irw   r   r   r   r   U  
     z"split_textline.<locals>.<listcomp>c                    sD   g | ]\}}|d   d   d  d   kr|d krn n|qS )r    r   r   r   r   )r   r   rr  r   r   r   Z  
    0c                    s0   g | ]}j   | jr|j   | jfqS r   )cellsrightrt   )r   c)r  tabler   r   r   `  
    r   r    r   r   c                    r  )r    r   r   r   )r   r   yr  r   r   r   v  r  c                    sD   g | ]\}}|d   d   d  d   kr|d krn n|qS r   r   )r   r  r  r  r   r   r   {  r  c                    s0   g | ]}j |   jr|j |   jfqS r   )r	  bottomr~   )r   r  )r  r  r   r   r     r  c                 S   r   r   r   r   r   r   r   r     r   r   c                 S   s   g | ]}|d    qS r  r   r   r   r   r   r     s    r   )r   is_empty	enumeratecolsrowsr	  rt   _objsr   r   r   r~   r   rs   r   r   
IndexErrorr   r   r   r  r   r6   )r  r   r   	flag_sizer   idxcut_text	x_overlapr_idxx_cutsr?   rowcut	y_overlapc_idxy_cutscolgrouped_charsrd   r   gcharsr   )r   r  r  r  r   split_textline6  s   




,




0
r&  c              	   C   s
  dgd \}}t t| jD ]}|j|j d | j| d k r|j|j d | j| d krg }	| jD ]E}
|
d |jkrs|
d |jkrs|
d |jkrM|jn|
d }|
d |jkr[|jn|
d }|	t	|| t	|
d |
d    q3|	d q3tt
tdd |	dkr| d}|j|jf}| jd d | jd d f}t| d	| d
|  |}|	t|	} nqdgd \}}}}|j| j| d krt	|j| j| d  }|j| j| d k rt	|j| j| d  }|j| j| d k rt	|j| j| d  }|j| j| d krt	|j| j| d  }t	|j|j dkr*dnt	|j|j }t	|j|j dkr?dnt	|j|j }|| }|||  |||   | }|rgt| ||||d|fS |rx||t|j||dfg|fS ||t| |fg|fS )a}  Gets indices of the table cell where given text object lies by
    comparing their y and x-coordinates.

    Parameters
    ----------
    table : camelot.core.Table
    t : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx
        and c_idx are row and column indices.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+

    r   r   r   r   r    c                 S   s   | dkS )Nr   r   )rw   r   r   r   <lambda>  s    z!get_table_index.<locals>.<lambda>
 z does not lie in column range    r         ?)r  r   r  )ranger   r  r   r~   r  rs   r   r   rz   r   filterr   r   warningswarnindexr   r&  r  r  r   )r  r   r   
split_textr  r   r  r!  r  lt_col_overlapr  leftr
  r   
text_range	col_range	y0_offset	y1_offset	x0_offset	x1_offsetXYchareaerrorr   r   r   get_table_index  sj   '0
(**
r>  c                 C   s   d}z2d}t dd | D |krtd| D ]}|d t|d  }|d D ]
}||d|  7 }q&qW |S  ty@   d}Y |S w )aw  Calculates a score based on weights assigned to various
    parameters and their error percentages.

    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.

    Returns
    -------
    score : float

    d   r   c                 S   r   r   r   )r   ewr   r   r   r   $  r   z$compute_accuracy.<locals>.<listcomp>z&Sum of weights should be equal to 100.r    )r   rW   r   ZeroDivisionError)error_weights	SCORE_VALscorer@  weighterror_percentager   r   r   compute_accuracy  s    rG  c                 C   sZ   d}g g }}| D ]}|D ]}|  dkr|d7 }qq	d|tt| t| d    }|S )zCalculates the percentage of empty strings in a
    two-dimensional list.

    Parameters
    ----------
    d : list

    Returns
    -------
    whitespace : float
        Percentage of empty cells.

    r   r   r    r?  )r   floatr   )r   
whitespacer_nempty_cellsc_nempty_cellsr  r   r   r   r   compute_whitespace/  s   
 rL        ?r+  皙?Tc              
   C   s   t | dV}t|}	t|	}
|
jstd|  t|||||||d}t }t||d}t||}t	
|
D ]}|| | }|jd }|jd }||f}q6||fW  d   S 1 s^w   Y  dS )aM  Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. To get the definitions of kwargs, see
    https://pdfminersix.rtfd.io/en/latest/reference/composable.html.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    line_overlap : float
    char_margin : float
    line_margin : float
    word_margin : float
    boxes_flow : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).

    rbz Text extraction is not allowed: )line_overlapchar_marginline_marginword_margin
boxes_flowdetect_vertical	all_texts)laparamsr   r   N)openr   r   is_extractabler   r   r   r
   r	   r   create_pagesprocess_page
get_resultr   )r;   rP  rQ  rR  rS  rT  rU  rV  r<   parserdocumentrW  rsrcmgrdeviceinterpreterpagelayoutr   r   dimr   r   r   get_page_layoutG  s6   "	




$re  charc                 C   s   |dkrt }n|dkrt}n|dkrt}n|dkrt}|du r!g }z| jD ]}t||r2|| q%|t||d7 }q%W |S  tyG   Y |S w )a  Recursively parses pdf layout to get a list of
    PDFMiner text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    rf  imager   r   N)ltype)	r   r   r   r   r  r   r   get_text_objectsAttributeError)rc  rh  r   LTObjectr?   r   r   r   ri    s*   

ri  )rQ   r  )r   )Fr   )FFr   )rM  r+  rM  rN  rM  TT)rf  N)Kr4   r   r!   r9   r#   r-   r.  	itertoolsr   operatorr   numpyr   pdfminer.pdfparserr   pdfminer.pdfdocumentr   pdfminer.pdfpager   r   pdfminer.pdfinterpr   r	   pdfminer.converterr
   pdfminer.layoutr   r   r   r   r   r   urllib.requestr   r   urllib.parser   r   r   r   r   rT   r   r   r   r)   rB   r_   r`   rb   re   objectrf   ru   ry   r   r   r   r   r   rH  r   r   boolr   r   r   r   r  r&  r>  rG  rL  re  ri  r   r   r   r   <module>   sv    	



	!<%*



4w
e
>