
    A ji                     J   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZ d dl m!Z!m"Z" d dl#m$Z% d dl#m&Z&m'Z'm(Z(  e)e&e'z   e(z             Z*e*+                    d           d Z,d Z-d Z.g dZ/g dZ0d6dZ1d6dZ2 G d de3          Z4d Z5d Z6d Z7d Z8d Z9d Z:d  Z;d!e<fd"Z=d!e<fd#Z>d!e?fd$Z@d!e?fd%ZAd7d'ZBd8d(ZCd8d)ZDd9d+ZE	 d:d,ZFd- ZGd. ZH	 	 	 	 	 	 	 d;d3ZId<d5ZJdS )=    N)groupby)
itemgetter)	PDFParser)PDFDocument)PDFPage)PDFTextExtractionNotAllowed)PDFResourceManager)PDFPageInterpreter)PDFPageAggregator)LAParamsLTAnnoLTCharLTTextLineHorizontalLTTextLineVerticalLTImage)Requesturlopen)urlparse)uses_relativeuses_netlocuses_params c                 \    	 t          |           j        t          v S # t          $ r Y dS w xY w)zCheck to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If url has a valid protocol return True otherwise False.

    F)	parse_urlscheme_VALID_URLS	Exception)urls    aC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\venv\Lib\site-packages\camelot/utils.pyis_urlr    (   s=    ~~$33   uus    
++c                     d}| rB|t          j        t          j        t          j        z   t          j        z             z  }| dz  } | B|S )Nr      )randomchoicestringdigitsascii_lowercaseascii_uppercase)lengthrets     r   random_stringr+   ;   sX    
C
 v}MF22V5KK
 
 	
 	!	  
 J    c                 J   t          d           d}t          j        dd          5 }ddi}t          | d|          }t	          |          }|                                                                }|d	k    rt          d
          |                    |	                                           ddd           n# 1 swxY w Y   t          j                            t          j                            |j                  |          }t          j        |j        |           |S )zDownload file from specified URL.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Temporary filepath.

       z.pdfwbF)deletez
User-AgentzMozilla/5.0Nzapplication/pdfzFile format not supported)r+   tempfileNamedTemporaryFiler   r   infoget_content_typeNotImplementedErrorwritereadospathjoindirnamenameshutilmove)r   filenamefheadersrequestobjcontent_typefilepaths           r   download_urlrF   E   s;     ""(((H		$T%	8	8	8 A/#tW--gxxzz2244,, 	C%&ABBB	

               w||BGOOAF33X>>H
K!!!Os   BB<<C C )columnsedge_tolrow_tol
column_tol)
process_background
line_scale	copy_text
shift_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolutionlatticec                 f    fd}dk    r |t           |            d S  |t          |            d S )Nc                     t          |                               t          |                                                    }|r5t          d                    t          |                     d d          d S )N,z cannot be used with flavor='')setintersectionkeys
ValueErrorr:   sorted)parser_kwargsinput_kwargsisecflavors      r   check_intersectionz*validate_input.<locals>.check_intersectionp   s|    =!!..s<3D3D3F3F/G/GHH 	88F4LL))QQQQQ  	 	r,   rU   )stream_kwargslattice_kwargs)kwargsrb   rc   s    ` r   validate_inputrg   o   s]          3=&11111>622222r,   c                     |dk    r6|                                  D ] }|t          v r|                     |           !n5|                                  D ] }|t          v r|                     |           !| S )NrU   )r\   rd   popre   )rf   rb   keys      r   remove_extrark   }   s      ;;== 	  	 Cm#  

3	  ;;== 	  	 Cn$  

3Mr,   c                       e Zd Zd Zd ZdS )TemporaryDirectoryc                 @    t          j                    | _        | j        S N)r1   mkdtempr<   )selfs    r   	__enter__zTemporaryDirectory.__enter__   s    $&&	yr,   c                 8    t          j        | j                   d S ro   )r=   rmtreer<   )rq   exc_type	exc_value	tracebacks       r   __exit__zTemporaryDirectory.__exit__   s    di     r,   N)__name__
__module____qualname__rr   rx    r,   r   rm   rm      s2          ! ! ! ! !r,   rm   c                     || z  }|S )zTranslates x2 by x1.

    Parameters
    ----------
    x1 : float
    x2 : float

    Returns
    -------
    x2 : float

    r|   )x1x2s     r   	translater      s     "HBIr,   c                     | |z  } | S )zScales x by scaling factor s.

    Parameters
    ----------
    x : float
    s : float

    Returns
    -------
    x : float

    r|   )xss     r   scaler      s     FAHr,   c                    | \  }}}}|\  }}}t          ||          }t          t          t          | |                    |          }t          ||          }t          t          t          | |                    |          }t          |          t          |          t          |          t          |          f}	|	S )ay  Translates and scales pdf coordinate space to image
    coordinate space.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.

    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
        space.

    )r   absr   int)
kfactorsr~   y1r   y2scaling_factor_xscaling_factor_ypdf_yknews
             r   	scale_pdfr      s    . NBB07-&	r#	$	$B	s9eVR(())+;	<	<B	r#	$	$B	s9eVR(())+;	<	<BGGSWWc"ggs2ww/DKr,   c                    |\  i }|                                  D ]}|\  }}}}	t          |          }t          t          t           |                              }t          |          }t          t          t           |	                              }	t	          | |          \  }
}fd|
D             }
fd|D             }t	          |
|          }||||||	f<   g }|D ]}t          |d                   t          |d                   }}t          t          t           |d                                       t          t          t           |d                                       }	}|                    ||||	f           g }|D ]}t          |d                   t          |d                   }}t          t          t           |d                                       t          t          t           |d                                       }	}|                    ||||	f           |||fS )av  Translates and scales image coordinate space to pdf
    coordinate space.

    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.

    Returns
    -------
    tables_new : dict
    v_segments_new : dict
    h_segments_new : dict

    c                 0    g | ]}t          |          S r|   )r   ).0jr   s     r   
<listcomp>zscale_image.<locals>.<listcomp>   s$    777auQ())777r,   c           
      h    g | ].}t          t          t           |                              /S r|   )r   r   r   )r   r   img_yr   s     r   r   zscale_image.<locals>.<listcomp>   s8    OOOauSE61--..0@AAOOOr,   r      r"      )r\   r   r   r   zipappend)tables
v_segments
h_segmentsr   
tables_newr   r~   r   r   r   j_xj_yjointsv_segments_newvh_segments_newhr   r   r   s                    @@@r   scale_imager      s|   2 18-&J[[]] 
. 
.BB2'((3y%,,--/?@@2'((3y%,,--/?@@q	?S77773777OOOOO3OOOS#'-
BB#$$N 0 0qt-..ad<L0M0MB#i!--..0@AA#i!--..0@AA  	r2r2.////N 0 0qt-..ad<L0M0MB#i!--..0@AA#i!--..0@AA  	r2r2.////~~55r,   c                     d}t          d |D                       }t          d |D                       }||k     r<t          d | D                       }t          d | D                       }||k     rdnd}|S )aC  Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.
    ltchar : list
        List of PDFMiner LTChar objects.

    Returns
    -------
    rotation : string
        '' if text in table is upright, 'anticlockwise' if
        rotated 90 degree anticlockwise and 'clockwise' if
        rotated 90 degree clockwise.

    r   c                 ^    g | ]*}|                                                                 (|+S r|   get_textstripr   ts     r   r   z get_rotation.<locals>.<listcomp>(  s1    CCCaajjll.@.@.B.BCCCCr,   c                 ^    g | ]*}|                                                                 (|+S r|   r   r   s     r   r   z get_rotation.<locals>.<listcomp>)  s1    AAAaAJJLL,>,>,@,@AAAAr,   c              3   Z   K   | ]&}|j         d          dk     o|j         d         dk    V  'dS r"   r   r   Nmatrixr   s     r   	<genexpr>zget_rotation.<locals>.<genexpr>+  s;      KKa;AHQK!OKKKKKKr,   c              3   Z   K   | ]&}|j         d          dk    o|j         d         dk     V  'dS r   r   r   s     r   r   zget_rotation.<locals>.<genexpr>,  s;      OOAAHQK!O?aOOOOOOr,   anticlockwise	clockwise)lensum)charshorizontal_textvertical_textrotationhlenvlenr   r   s           r   get_rotationr     s    * HCC?CCCDDDAA=AAABBDd{ QKKUKKKKK	OOOOOOO&/-&?P??[Or,   c                     | d         | d         f| d         | d         ffd|D             }fd|D             }||fS )a6  Returns all line segments present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of vertical horizontal segments.

    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.

    r   r"   r   r   c                     g | ]V}|d          d          dz
  k    |d         d          dz   k     ,d         dz
  |d         cxk    rd         dz   k    Qn n|WS )r"   r   r   r   r|   )r   r   lbrts     r   r   z$segments_in_bbox.<locals>.<listcomp>I         Q4"Q%!) !"!r!uqy 0 68UQY!A$    KMQ%RS)    	  r,   c                     g | ]V}|d          d          dz
  k    |d         d          dz   k     ,d         dz
  |d         cxk    rd         dz   k    Qn n|WS r   r   r"   r|   )r   r   r   r   s     r   r   z$segments_in_bbox.<locals>.<listcomp>N  r   r,   r|   )bboxr   r   v_sh_sr   r   s        @@r   segments_in_bboxr   1  s    , q'47	B
q'47	B      C
      C
 8Or,   c                    | d         | d         f| d         | d         ffd|D             }d |D             }|D ]y}|                                 D ]b}||k    r	t          ||          rIt          ||          t          |          z  dk    r%t	          ||          r|                    |           czt          |          }|S )a  Returns all text objects present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, discarding the overlapping ones

    r   r"   r   r   c                     g | ]h}d          dz
  |j         |j        z   dz  cxk    rd          dz   k    1n n4d         dz
  |j        |j        z   dz  cxk    rd         dz   k    cn n|iS )r   r          @r"   )x0r~   y0r   )r   r   r   r   s     r   r   z text_in_bbox.<locals>.<listcomp>i  s       a519+    02!uqy     qEAI!$+,	    131		    	  r,   c                     h | ]}|S r|   r|   r   s     r   	<setcomp>ztext_in_bbox.<locals>.<setcomp>q  s    !Ar,   g?)copybbox_intersectbbox_intersection_area	bbox_areabbox_longerdiscardlist)	r   textt_bboxrestbabbunique_boxesr   r   s	          @@r   text_in_bboxr   V  s   " q'47	B
q'47	B      F vD ) )))++ 	) 	)BRx b"%% )*2r22Yr]]BcI )"2r** )R(((	) ::Lr,   returnc                    t          | j        |j                  }t          | j        |j                  }t          | j        |j                  }t          | j        |j                  }||k     s||k    rdS ||z
  ||z
  z  }|S )a.  Returns area of the intersection of the bounding boxes of two PDFMiner objects.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the intersection of the bounding boxes of both objects

            )maxr   minr   r~   r   )r   r   x_lefty_topx_righty_bottomintersection_areas          r   r   r     s     FruE"%G25"%  H 8e+ s 6)eh.>?r,   c                 @    | j         | j        z
  | j        | j        z
  z  S )zReturns area of the bounding box of a PDFMiner object.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Area of the bounding box of the object

    r~   r   r   r   )r   s    r   r   r     s     EBEMbebem,,r,   c                     | j         |j        k    o/|j         | j        k    o| j        |j        k    o|j        | j        k    S )a   Returns True if the bounding boxes of two PDFMiner objects intersect.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    r   r   r   s     r   r   r     s9     5BE>RberunR"%RBERUNRr,   c                 B    | j         | j        z
  |j         |j        z
  k    S )a3  Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    )r~   r   r   s     r   r   r     s     EBEMruru}--r,   r   c                     g }| D ]\}|s|                     |           |d         }t          j        |||          r||z   dz  }||d<   G|                     |           ]|S )zMerges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.

    Parameters
    ----------
    ar : list
    line_tol : int, optional (default: 2)

    Returns
    -------
    ret : list

    )atolr   )r   npisclose)arrO   r*   atemps        r   merge_close_linesr     s     C 	 	 	JJqMMMMr7Dz$111 qC'B

1Jr,   c           	          |s| S t          j        dd                    t          t           j        |                     dd| t           j                  }|S )a  Strips any characters in `strip` that are present in `text`.
    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.
    Returns
    -------
    stripped : str
    [r   ])flags)resubr:   mapescapeUNICODE)r   r   strippeds      r   
text_stripr    s\      v.RWWSE**++...D
  H Or,   c                 h   |dk    rd | D             }n|dk    rd | D             }d |D             }t          t          |                    dk    r2g }t          |          }t          |t	          d                    D ]\  }}||k    rd |D             }	d                    |	                                          rS|	                    d	d
           |	                    d           |                    d                    |	                     d |D             }	d                    |	                                          r(|                    d                    |	                     d                    |          }
nd                    d | D                       }
t          |
|          S )a  Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.

    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    fstring : string

    
horizontalc                     g | ]F}t          |t                    |                                t          j        |j        d           fGS r.   )decimals)
isinstancer   r   r   roundheightr   s     r   r   z"flag_font_size.<locals>.<listcomp>  sV     
 
 
a((
ZZ\\28AHq999:
 
 
r,   verticalc                     g | ]F}t          |t                    |                                t          j        |j        d           fGS r  )r	  r   r   r   r
  widthr   s     r   r   z"flag_font_size.<locals>.<listcomp>  sV     
 
 
a((
ZZ\\28AGa8889
 
 
r,   c                 @    g | ]\  }}t          j        |d           S r  )r   r
  )r   r   sizes      r   r   z"flag_font_size.<locals>.<listcomp>!  s+    888
d$	#	#	#888r,   r"   c                     g | ]
}|d          S r   r|   r   s     r   r   z"flag_font_size.<locals>.<listcomp>'      ...1!A$...r,   r   r   z<s>z</s>c                     g | ]
}|d          S r  r|   r   s     r   r   z"flag_font_size.<locals>.<listcomp>-  r  r,   c                 6    g | ]}|                                 S r|   r   r   s     r   r   z"flag_font_size.<locals>.<listcomp>2  s     :::A1::<<:::r,   )
r   rZ   r   r   r   r:   r   insertr   r  )textline	direction
strip_textdlflistmin_sizerj   r   fcharsfstrings              r   flag_font_sizer!    s   & L  

 

 
 

 
j	  

 

 
 

 	98a888A
3q66{{Q <q66!!Z]]33 
	2 
	2JCh 	2.....776??((** 2MM!U+++MM&)))LL111.....776??((** 2LL111''%..'':::::;;gz***r,   Fc           
      ,    d}g }|j         	 |dk    r|                                svfdt           j                  D             }fdt           j                  D             }|d          fd|D             }	|	s!|d          j                 d         j        fg}	|j        D ]}
 j                 }|	D ]}t          |
t                    r|d         |
j
        |
j        z   dz  cxk    r|d         k    r?n n<|
j        |
j        z   dz  |d         k    r |                    |d         |
f            nb||	d         k    r!|                    |d         dz   |
f           t          |
t                    r|                    |d         |
f           n|d	k    r|                                stfd
t           j                  D             }fdt           j                  D             }|d          fd|D             }|s!|d          j        d                  j        fg}|j        D ]}
 j                 }|D ]}t          |
t                    r|d         |
j        |
j        z   dz  cxk    r|d         k    r?n n<|
j
        |
j        z   dz  |d         k    r |                    |d         |
f            nb||d         k    r!|                    |d         dz
  |
f           t          |
t                    r|                    |d         |
f           n(# t           $ r dd|                                fgcY S w xY wg }t%          |t'          dd                    D ]\  }}|r?|                    |d         |d         t)          d |D             ||          f           Fd |D             }|                    |d         |d         t+          d                    |          |          f           |S )a9  Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.

    Parameters
    ----------
    table : camelot.core.Table
    textline : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts.)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    grouped_chars : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is the an lttextline substring.

    r   r  c                 b    g | ]+\  }}|d          d         k    d          |d         k    )|,S r   r|   )r   ir   r   s      r   r   z"split_textline.<locals>.<listcomp>U  U       AqQ447? (,Aw!A$  r,   c                 v    g | ]5\  }}|d          d          d         z   dz  cxk    r|d         k    0n n|6S )r"   r   r   r   r|   )r   r   rr   s      r   r   z"split_textline.<locals>.<listcomp>Z  |       AqQ4DGd1g-2    78d      r,   c                 t    g | ]4}j                  |         j        |j                  |         j        f5S r|   )cellsrightr   )r   cr'  tables     r   r   z"split_textline.<locals>.<listcomp>`  sO       ./Aq@Q@WEKN1%()  r,   r   r"   r   r  c                 b    g | ]+\  }}|d          d         k    d          |d         k    )|,S )r"   r   r   r|   )r   r   yr   s      r   r   z"split_textline.<locals>.<listcomp>v  r%  r,   c                 v    g | ]5\  }}|d          d          d         z   dz  cxk    r|d         k    0n n|6S r   r|   )r   r$  r,  r   s      r   r   z"split_textline.<locals>.<listcomp>{  r(  r,   c                 t    g | ]4}j         |                  j        |j         |                  j        f5S r|   )r*  bottomr   )r   r'  r,  r-  s     r   r   z"split_textline.<locals>.<listcomp>  sO       ./Aq@Q@XEKN1%()  r,   c                     g | ]
}|d          S r   r|   r   s     r   r   z"split_textline.<locals>.<listcomp>  s    ---!1---r,   r  c                 B    g | ]}|d                                           S r4  r  r   s     r   r   z"split_textline.<locals>.<listcomp>  s$    555!admmoo555r,   r   )r   is_empty	enumeratecolsrowsr*  r   _objsr	  r   r   r   r   r~   r   r   
IndexErrorr   r   r   r!  r  r:   )r-  r  r  	flag_sizer  idxcut_text	x_overlapr_idxx_cutsrC   rowcut	y_overlapc_idxy_cutscolgrouped_charsrj   r   gcharsr   r,  r'  s   `                    @@@r   split_textlinerK  6  s   4 CH=DD/$ A	:X->->-@-@ A	:   %ej11  I
   %ej11  E
 aA    3<  F  A$Q<Q);)>?@~ : :jm! : :C!#v.. :Fsv!&;	F 	F 	F 	F?B1v	F 	F 	F 	F 	F!$#&A 5Q ?	F %OOQA,<===!E  #fRj0 F (CFQJ0D E E E#C00 : CFC(8999:  *$  	:X->->-@-@  	:   %ej11  I
   %ej11  E
 aA    3<  F  A$Q<R);)>?@~ : :jm! : :C!#v.. :Fsv!&;	F 	F 	F 	F?B1v	F 	F 	F 	F 	F!$#&A 5Q ?	F %OOSVQ,<===!E  #fRj0 F (Q!Q0D E E E#C00 : QC(8999 / / /R**,,-..../Mh
1a(8(899  
U 	  FF"--u---yZ      65u555F  QQBGGFOOZ!H!HI    s   L"L4 4"MMc           
      F   dgdz  \  }}t          t          | j                            D ]}|j        |j        z   dz  | j        |         d         k     r|j        |j        z   dz  | j        |         d         k    rg }	| j        D ]}
|
d         |j        k    r|
d         |j        k    r|
d         |j        k    r|j        n|
d         }|
d         |j        k    r|j        n|
d         }|	                    t          ||z
            t          |
d         |
d         z
            z             |	                    d           t          t          t          d |	                              dk    rw|                                                    d          }|j        |j        f}| j        d         d         | j        d         d         f}t          j        | d| d	|            |}|	                    t#          |	                    } ndgd
z  \  }}}}|j        | j        |         d         k    r(t          |j        | j        |         d         z
            }|j        | j        |         d         k     r(t          |j        | j        |         d         z
            }|j        | j        |         d         k     r(t          |j        | j        |         d         z
            }|j        | j        |         d         k    r(t          |j        | j        |         d         z
            }t          |j        |j        z
            dk    rdnt          |j        |j        z
            }t          |j        |j        z
            dk    rdnt          |j        |j        z
            }||z  }|||z   z  |||z   z  z   |z  }|rt%          | ||||          |fS |r||t'          |j        ||          fg|fS ||t+          |                                |          fg|fS )a}  Gets indices of the table cell where given text object lies by
    comparing their y and x-coordinates.

    Parameters
    ----------
    table : camelot.core.Table
    t : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx
        and c_idx are row and column indices.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+

    r   r   r   r   r"   c                     | dk    S )Nr   r|   )r   s    r   <lambda>z!get_table_index.<locals>.<lambda>  s
    b r,   
 z does not lie in column range    r         ?)r=  r  r5  )ranger   r:  r   r   r9  r~   r   r   r   r   filterr   r   warningswarnindexr   rK  r!  r;  r  )r-  r   r  
split_textr=  r  rA  rF  r'  lt_col_overlapr,  leftr+  r   
text_range	col_range	y0_offset	y1_offset	x0_offset	x1_offsetXYchareaerrors                           r   get_table_indexre    sK   N 4!8LE53uz??##  D14K3Aq!11 	qtad{c6IEJM

M 7 	  NZ . .Q414< .AaDADL .#$Q414<9144QqTD$%aDADL:ADDadE"))#dUl*;*;c!A$1+>N>N*NOOOO"))"----400.AABBCCqH zz||))$//dAD\
"Z]1-uz"~a/@A	SSjSS	SS   E"((^)<)<==EE 34q.Iy)Ytej"" 5uz%03344	tej"" 5uz%03344	tej"" 5uz%03344	tej"" 5uz%03344	14!$;3&<Cqt,<,<A14!$;3&<Cqt,<,<AUF9y()a9y3H.IJfTE Qq)yZ   	
 	
  	Q &qw	jQQQ 	 	 E:ajjllJ#G#GHI5PPr,   c                     d}	 d}t          d | D                       |k    rt          d          | D ]6}|d         t          |d                   z  }|d         D ]}||d|z
  z  z  }7n# t          $ r d}Y nw xY w|S )aw  Calculates a score based on weights assigned to various
    parameters and their error percentages.

    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.

    Returns
    -------
    score : float

    d   r   c                     g | ]
}|d          S r  r|   )r   ews     r   r   z$compute_accuracy.<locals>.<listcomp>$  s    ..."1...r,   z&Sum of weights should be equal to 100.r"   )r   r]   r   ZeroDivisionError)error_weights	SCORE_VALscoreri  weighterror_percentages         r   compute_accuracyrp    s      I	.....//9< 	GEFFF 	9 	9BUSAZZ'F$&qE 9 9 1'7#7889	9    Ls   A'A, ,A;:A;c           	          d}g g }}| D ]$}|D ]}|                                 dk    r|dz  } %d|t          t          |           t          | d                   z            z  z  }|S )zCalculates the percentage of empty strings in a
    two-dimensional list.

    Parameters
    ----------
    d : list

    Returns
    -------
    whitespace : float
        Percentage of empty cells.

    r   r   r"   rg  )r   floatr   )r  
whitespacer_nempty_cellsc_nempty_cellsr$  r   s         r   compute_whitespacerv  /  s     J%'NN     	  	 AwwyyB  a
	  
U3q66C!II+=%>%>>?Jr,         ?rR  皙?Tc           
         t          | d          5 }t          |          }	t          |	          }
|
j        st	          d|            t          |||||||          }t                      }t          ||          }t          ||          }t          j
        |
          D ]I}|                    |           |                                }|j        d         }|j        d         }||f}J||fcddd           S # 1 swxY w Y   dS )aM  Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. To get the definitions of kwargs, see
    https://pdfminersix.rtfd.io/en/latest/reference/composable.html.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    line_overlap : float
    char_margin : float
    line_margin : float
    word_margin : float
    boxes_flow : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).

    rbz Text extraction is not allowed: )line_overlapchar_marginline_marginword_margin
boxes_flowdetect_vertical	all_texts)laparamsr   r   N)openr   r   is_extractabler   r   r	   r   r
   r   create_pagesprocess_page
get_resultr   )r?   r{  r|  r}  r~  r  r  r  r@   parserdocumentr  rsrcmgrdeviceinterpreterpagelayoutr  r  dims                       r   get_page_layoutr  G  sv   D 
h		 1v&&& 	-=8==   %###!+
 
 
 %&&"7X>>>(&99(22 	" 	"D$$T***&&((FKNE[^F&/CCs{3                 s   CC<<D D charc                 (   |dk    rt           }n)|dk    rt          }n|dk    rt          }n|dk    rt          }|g }	 | j        D ]<}t          ||          r|                    |           (|t          ||          z  }=n# t          $ r Y nw xY w|S )a  Recursively parses pdf layout to get a list of
    PDFMiner text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    r  imager   r   N)ltype)	r   r   r   r   r;  r	  r   get_text_objectsAttributeError)r  r  r   LTObjectrC   s        r   r  r    s    &  &	'	 &	#	# &'	/	! &% < 	8 	8C#x(( 8%c7777		8
    Hs   AB 
BB)rU   r4  )r   )Fr   )FFr   )rw  rR  rw  rx  rw  TT)r  N)Kr8   r   r#   r=   r%   r1   rU  	itertoolsr   operatorr   numpyr   pdfminer.pdfparserr   pdfminer.pdfdocumentr   pdfminer.pdfpager   r   pdfminer.pdfinterpr	   r
   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   urllib.requestr   r   urllib.parser   r   r   r   r   rZ   r   r   r    r+   rF   rd   re   rg   rk   objectrm   r   r   r   r   r   r   r   rr  r   r   boolr   r   r   r  r!  rK  re  rp  rv  r  r  r|   r,   r   <module>r     s   
			 				                      ( ( ( ( ( ( , , , , , , $ $ $ $ $ $ 8 8 8 8 8 8 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0                , + + + + + + + . . . . . . @ @ @ @ @ @ @ @ @ @ c-+-;<<   B     &    6 A@@  3 3 3 3	 	 	 	! ! ! ! ! ! ! !  "  "  B96 96 96x  >" " "J' ' 'Te    4-U - - - - Sd S S S S".4 . . . ."   8   41+ 1+ 1+ 1+hs s s sn HJbQ bQ bQ bQJ  <  4 ; ; ; ;|% % % % % %r,   