o
    j?                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZ	ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZmZmZmZ ddlmZ ed	ZG d
d deZ dS )    N   )
BaseParser   )Table)scale_image	scale_pdfsegments_in_bboxtext_in_bboxmerge_close_linesget_table_indexcompute_accuracycompute_whitespace)adaptive_threshold
find_linesfind_contoursfind_joints)BACKENDScamelotc                   @   s   e Zd ZdZdddddddgddddddd	d
ddfddZedd Zedd ZedddZdd Z	dd Z
dd Zdi fddZdS )Latticea-  Lattice method of parsing looks for lines between text
    to parse the table.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    line_tol : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    resolution : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.

    NF   lt r   r   i,  ghostscriptc                 K   sj   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _t|| _d S N)table_regionstable_areasprocess_background
line_scale	copy_text
shift_text
split_text	flag_size
strip_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolutionr   _get_backendbackend)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   kwargs r/   ]/var/www/html/fyndo/pharma/fyndo/venv/lib/python3.10/site-packages/camelot/parsers/lattice.py__init__a   s    zLattice.__init__c                    sj    fdd}t  tr( t vrtd  d dkr#tdt t   S | s3td  d S )	Nc                     s   dd t  D } d| v S )Nc                 S   s   g | ]}| d du r|qS )__F)
startswith).0methodr/   r/   r0   
<listcomp>   s    zDLattice._get_backend.<locals>.implements_convert.<locals>.<listcomp>convert)dir)methodsr,   r/   r0   implements_convert   s   z0Lattice._get_backend.<locals>.implements_convertzUnknown backend 'z:' specified. Please use either 'poppler' or 'ghostscript'.r   z'ghostscript' will be replaced by 'poppler' as the default image conversion backend in v0.12.0. You can try out 'poppler' with backend='poppler'.'z#' must implement a 'convert' method)
isinstancestrr   keysNotImplementedErrorwarningswarnDeprecationWarning)r,   r;   r/   r:   r0   r+      s"   



zLattice._get_backendc                 C   s2  g }|D ]\}}}|D ]}|dkr-| j | | jr-| j | | js-|d8 }| j | | jr!|dkrM| j | | jrM| j | | jsM|d7 }| j | | jrA|dkrm| j | | jrm| j | | jsm|d8 }| j | | jra|dkr| j | | jr| j | | js|d7 }| j | | jrq||||f q|S )a  Reduces index of a text object if it lies within a spanning
        cell.

        Parameters
        ----------
        table : camelot.core.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more strings from above and pass them as a
            list to specify where the text in a spanning cell should
            flow.

        Returns
        -------
        indices : list
            List of tuples of the form (r_idx, c_idx, text) where
            r_idx and c_idx are new row and column indices for text.

        r   r   rr   b)cellshspanleftrightvspantopbottomappend)r   idxr!   indicesr_idxc_idxtextdr/   r/   r0   _reduce_index   s4   zLattice._reduce_indexc                 C   s(  |D ]}|dkrKt t| jD ]:}t t| j| D ].}| j| | j dkrH| j| | jrH| j| | jsH| j| |d  j| j| | _qqq|dkrt t| jD ]:}t t| j| D ].}| j| | j dkr| j| | jr| j| | js| j|d  | j| j| | _qaqVq| S )a  Copies over text in empty spanning cells.

        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns.

        Returns
        -------
        t : camelot.core.Table

        hr   r   v)	rangelenrF   rR   striprG   rH   rJ   rK   )r   r    fijr/   r/   r0   _copy_spanning_text   s(       zLattice._copy_spanning_textc                    sx   fdd}t | j| j| j| jd\| _| _| jjd }| jjd }|t| j	 }|t| j
 }| j	t| }| j
t| }||| j
f |||f}| jd u rd }	| jd urZ|| j}	t| j|	d| j| jd\}
}t| j|	d| j| jd\}}t|
|}t||
|}n%t| jd| j| jd	\}
}t| jd| j| jd	\}}|| j}t||
|}t|| _t||||\| _| _| _d S )
Nc              	      s   g }| D ]9}| d\}}}}t|}t|}t|}t|}t||||f \}}}}|||t|| t|| f q|S )N,)splitfloatr   rM   abs)areasscaled_areasareax1y1x2y2image_scalersr/   r0   scale_areas   s   $z1Lattice._generate_table_bbox.<locals>.scale_areas)r   	blocksizecr   r   vertical)regions	directionr   r)   
horizontal)rp   r   r)   )r   	imagenamer   r'   r(   image	thresholdshaper`   	pdf_width
pdf_heightr   r   r   r   r)   r   r   copydeepcopytable_bbox_unscaledr   
table_bboxvertical_segmentshorizontal_segments)r-   rk   image_widthimage_heightimage_width_scalerimage_height_scalerpdf_width_scalerpdf_height_scalerpdf_scalersro   vertical_maskr|   horizontal_maskr}   contoursr{   rb   r/   ri   r0   _generate_table_bbox   sj   









zLattice._generate_table_bboxc                    s*  i }t || j| j\}}t|| j|d< t|| j|d< |d jdd d |d jdd d || _t| j	|  \ t
 t
  |d |d g |d	 |d
 g tt | jd ttdd| jd fddtdt d	 D  fddtdtd	 D  ||fS )Nrq   rn   c                 S   s   | j  | jfS r   )y0x0xr/   r/   r0   <lambda>E      z4Lattice._generate_columns_and_rows.<locals>.<lambda>)keyc                 S   s   | j | j fS r   )r   r   r   r/   r/   r0   r   F  r   r   r   r      )r%   T)reversec                        g | ]} |  |d   fqS r   r/   r4   r[   )colsr/   r0   r6   R       z6Lattice._generate_columns_and_rows.<locals>.<listcomp>c                    r   r   r/   r   )rowsr/   r0   r6   S  r   )r   r|   r}   r	   horizontal_textvertical_textsortt_bboxzipr{   listextendr
   sortedr%   rW   rX   )r-   	table_idxtkr   v_sh_sr/   )r   r   r0   _generate_columns_and_rows<  s$   
  z"Lattice._generate_columns_and_rowsc              
   K   s  | d}| d}|d u s|d u rtd| jt||}|j||| jd}| }| }g }dD ]?}	| j	|	 D ]7}
t
||
|	| j| j| jd\}}|d d dkrr|| tj||| jd	}|D ]\}}}||j| | _qdq;q4td
|gg}| jd urtj|| jd}|j}t||_|jj|_t|}d|_||_||_|d |_ t!t"j#$| j%dd|_&g }|'dd | j(D  |'dd | j)D  ||_*| j+| j,f|_-| j.| j/f|_0d |_1|S )Nr   r   zNo segments found on {})r&   )rn   rq   )r"   r#   r$   r   )r   )r!   d   )r    latticer   zpage-r   c                 S       g | ]}|j |j|j|jfqS r/   r   r   re   rf   r4   r   r/   r/   r0   r6     r   z+Lattice._generate_table.<locals>.<listcomp>c                 S   r   r/   r   r   r/   r/   r0   r6     r   )2get
ValueErrorformatrootnamer   	set_edgesr&   
set_borderset_spanr   r   r"   r#   r$   rM   r   rT   r!   rF   rR   r   r    r]   datapd	DataFramedfru   r   flavoraccuracy
whitespaceorderintospathbasenamereplacepager   r   r   _textrs   rz   _imager|   r}   	_segments
_textedges)r-   r   r   r   r.   r   r   table
pos_errorsrp   r   rO   errorrP   rQ   rR   r   r   r   r   r/   r/   r0   _generate_tableW  s`   







zLattice._generate_tablec                 C   s   |  || |stdtj| j | js9| j	r*t
dtj| j g S t
dtj| j g S | j| j| j |   g }tt| j dd ddD ] \}}| ||\}}}	}
| j||||	|
d}||_|| qV|S )	NzProcessing {}z:{} is image-based, camelot only works on text-based pages.zNo tables found on {}c                 S   s   | d S )Nr   r/   r   r/   r/   r0   r     s    z(Lattice.extract_tables.<locals>.<lambda>T)r   r   )r   r   )_generate_layoutloggerinfor   r   r   r   r   r   imagesrA   rB   r,   r7   filenamerr   r   	enumerater   r{   r?   r   r   _bboxrM   )r-   r   suppress_stdoutlayout_kwargs_tablesr   r   r   r   r   r   r   r/   r/   r0   extract_tables  s2   zLattice.extract_tablesr   )__name__
__module____qualname____doc__r1   staticmethodr+   rT   r]   r   r   r   r   r/   r/   r/   r0   r   %   s:    =
%

, J=r   )!r   sysrx   localeloggingrA   numpynppandasr   baser   corer   utilsr   r   r   r	   r
   r   r   r   image_processingr   r   r   r   backends.image_conversionr   	getLoggerr   r   r/   r/   r/   r0   <module>   s   (

