
    n j?4                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ  ed            e e j        d          	          Zd
ZdedefdZdedefdZdedefdZdedefdZdededefdZdS )    N)Path)OpenAI)read_file_content)load_dotenvT)overrideOPENAI_API_KEY)api_keyu{  You are a pharma data extraction engine.

Extract EVERY field available from this document. Do not skip any field.

Return JSON only with this structure:
{
  "filename": string,
  "document_type_code": one of ["SOP","BMR","EQP","LIMS","MAINT","DEV","UNKNOWN"],
  "batch_id": string or null,
  "product_name": string or null,
  "plant": string or null,
  "manufacturing_date": string or null,
  "parameters": [
    {
      "parameter_name": string,
      "observed_value": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "process_step": string or null,
      "equipment_id": string or null,
      "timestamp": string or null,
      "operator_id": string or null,
      "status": one of ["Normal", "DEVIATION", "PASS", "FAIL", "Missing"],
      "notes": string or null,
      "source_reference": string or null,
      "confidence": number 0-100
    }
  ],
  "lims_results": [
    {
      "sample_id": string or null,
      "test_name": string or null,
      "result_value": string or null,
      "specification_low": string or null,
      "specification_high": string or null,
      "unit": string or null,
      "status": string or null,
      "test_date": string or null,
      "analyst_id": string or null,
      "instrument_id": string or null,
      "notes": string or null
    }
  ],
  "equipment_readings": [
    {
      "timestamp": string or null,
      "equipment_id": string or null,
      "parameter": string or null,
      "value": string or null,
      "unit": string or null,
      "setpoint": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "status": string or null,
      "operator_id": string or null,
      "notes": string or null
    }
  ],
  "maintenance_records": [
    {
      "equipment_id": string or null,
      "maintenance_date": string or null,
      "maintenance_type": string or null,
      "performed_by": string or null,
      "duration_hours": string or null,
      "observation": string or null,
      "action_taken": string or null,
      "next_pm_due": string or null,
      "work_order_id": string or null,
      "sign_off_by": string or null,
      "status": string or null
    }
  ],
  "deviation_details": {
    "deviation_id": string or null,
    "severity": string or null,
    "description": string or null,
    "reported_by": string or null,
    "date_of_occurrence": string or null,
    "time_of_occurrence": string or null,
    "immediate_actions": array of strings,
    "equipment_id": string or null,
    "calibration_status": string or null,
    "batch_disposition": string or null,
    "root_cause_preliminary": string or null,
    "linked_records": object or null
  },
  "sop_limits": [
    {
      "process_step": string or null,
      "equipment_id": string or null,
      "parameter_name": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "criticality": string or null,
      "action_if_exceeded": string or null
    }
  ],
  "metadata": {
    "total_parameters": number,
    "total_lims_results": number,
    "total_equipment_readings": number,
    "total_maintenance_records": number,
    "deviations_found": number,
    "missing_values": number,
    "low_confidence_count": number,
    "operators_found": array of strings,
    "equipment_found": array of strings,
    "extraction_method": string
  }
}

Field extraction rules:
- BMR file: fill parameters[] — each row has batch_id, step_name, equipment_id, timestamp, parameter_name, observed_value, lower_limit, upper_limit, unit, status
- Equipment Log file: fill equipment_readings[] — each row has timestamp, equipment_id, parameter, value, unit, setpoint, lower_limit, upper_limit, status, operator_id, notes
- LIMS file: fill lims_results[] — each row has sample_id, batch_id, test_name, result_value, specification_low, specification_high, unit, status, test_date, analyst_id, instrument_id, notes
- Maintenance file: fill maintenance_records[] — each row has equipment_id, maintenance_date, maintenance_type, performed_by, observation, action_taken, next_pm_due, work_order_id, sign_off_by, status
- Deviation JSON: fill deviation_details{} with all fields
- SOP PDF: fill sop_limits[] with all CPP rows

CRITICAL: Never leave operator_id null if it exists in the source data.
CRITICAL: Never leave status null if it exists in the source data.
CRITICAL: Extract ALL rows — do not summarize or skip rows.
CRITICAL: If this is a scanned/handwritten document — read every visible field carefully including stamps, signatures, handwritten values.

Document filename: <filename>
Document content:
<content>
	file_pathreturnc                    	 t          j        t          |                     }d}|D ]+}||                                                                z  },|                                 t          |          dk     S # t          $ r Y dS w xY w)uN   Returns True if PDF has no extractable text — meaning it is a scanned image. 2   F)fitzopenstrget_textstripcloselen	Exception)r
   doc
total_textpages       DC:\Users\Terasoftware\OneDrive\Desktop\faahhh\fyndo\fyndo\extract.pyis_scanned_pdfr      s    	iI''
 	2 	2D$--////111JJ		:##   uus   A7A: :
BBc                 8    | j                                         dv S )z!Returns True if file is an image.)z.pngz.jpgz.jpegz.tiffz.tifz.bmpz.webp)suffixlower)r
   s    r   is_image_filer      s    !!##'bbb    c                    t          d| j                    g }t          |           rt          | d          5 }|                                }ddd           n# 1 swxY w Y   t          j        |                                          }| j        	                                
                    dd          }|dk    rd}|                    |d| d	           nt          j        t          |                     }t          t          t!          |          d
                    D ]s}||         }|                    d          }	|	                    d          }t          j        |                                          }|                    |dd	           t|                                 |st)          d          t!          |          }
t          d|
 d           dd|
 d|
 d|
 d| j         dt*          
                    d| j                  
                    dd           
dg}|D ]/}|                    dd|d          d|d           d!d"d#           0t,          j        j                            d$d%d&d'd(d)|d(gd*d+id,-          }|j        d,         j        j        }t;          j        |          }|                    d.i           |d.<   d/|d.         d0<   |S )1zg
    Converts scanned PDF pages or images to base64
    and sends to GPT-4o Vision for extraction.
    z0[EXTRACT] Using GPT-4o Vision for scanned file: rbN.r   jpgjpegzimage/)b64mimer   i,  )dpipngz	image/pngzNo images extracted from filez[EXTRACT] Sending z pages to GPT-4o VisiontextzdYou are a pharma data extraction engine specializing in scanned pharma documents.
This document has z pages. Extract data from ALL pages.

CRITICAL INSTRUCTIONS for scanned/handwritten documents:
- Read EVERY cell in every table row across ALL u-   pages — do not skip any row
- CRITICAL: Look for rows with status DEVIATION — these are the most important rows
- CRITICAL: Look for temperature readings above 70°C — these indicate deviations
- For unclear or partially visible text — make your best interpretation based on context
- For handwritten numbers — read carefully, common values are temperatures (60-80°C), weights (kg), percentages
- Never write "(unclear)" — always provide your best reading of the value
- If a value is truly unreadable — use null, not "(unclear)"
- Read ALL ue   pages — extract data from every page not just the first
- For rotated or sideways text — rotate mentally and read it
- For faded text — use context clues from surrounding cells to interpret
- Batch IDs follow pattern like BTC0048, equipment IDs like EQP_GRAN_01
- Status values are typically: Normal, DEVIATION, Alert, Pass, Fail

Document filename: z


<filename>	<content>u@   [See scanned document images above — read every visible field])typer*   	image_urlzdata:r'   z;base64,r&   high)urldetail)r-   r.   gpt-4oip  systemzYou are a pharma data extraction engine. Extract every visible field from scanned documents including handwritten data. Return JSON only.rolecontentuserr-   json_objectr   )model
max_tokensmessagesresponse_formattemperaturemetadatagpt4o_vision_ocrextraction_method) printnamer   r   readbase64	b64encodedecoder   r   replaceappendr   r   rangeminr   
get_pixmaptobytesr   
ValueErrorEXTRACT_PROMPTclientchatcompletionscreatechoicesmessager6   jsonloadsget)r
   
images_b64f	img_bytesr&   extr   page_numr   pixtotal_pagesr6   imgresponserawresults                   r   extract_with_visionrc      s   
 

MY^
M
MNNNJY )T"" 	!aI	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!y))0022$$&&..sB77%< 	C#~~~>>???? iI''c#c((B//00 	A 	AHx=D//c/**CE**I"9--4466Cc;??@@@@		 :8999j//K	
C{
C
C
CDDD
 SS S 2=	S S S S" N#S S& in55==k  LN  O  O'S S	
 	
G6   @s6{@@CJ@@  
 
 	 	 	 	 {&-- ! g 
 " 	
  / .  H" 
1

%
-CZ__FJ33F:.@F:*+Ms   AA!Ac                   K   	 d}t          |           rd}t          d| j                    n]| j                                        dk    r)t          |           rd}t          d| j                    nt          d| j                    |rt          |           }nt          |           }t          	                    d| j                  }|	                    d|d d	                   }t          j        j                            d
dddd|dgddid          }|j        d         j        j        }t#          j        |          }|                    d          rd|d         d<   | j        |d<   d|d<   |S # t"          j        $ r"}t+          | j        d|           cY d }~S d }~wt,          $ r,}t+          | j        t/          |                    cY d }~S d }~ww xY w)NFTz[EXTRACT] Image file detected: z.pdfz [EXTRACT] Scanned PDF detected: z[EXTRACT] Text-based file: r+   r,   i@  r2   r3   z~You are a pharma data extraction engine. Extract every field from every row. Never skip data. Return JSON only. No extra text.r4   r7   r-   r8   r   )r9   r;   r<   r=   r>   r*   r@   filename	extractedstatuszparse_error: )r   rA   rB   r   r   r   rc   r   rN   rG   rO   rP   rQ   rR   rS   rT   r6   rU   rV   rW   JSONDecodeError_error_responser   r   )r
   
use_visionrb   r6   promptr`   ra   es           r   extract_documentrm     sM     27
## 	BJDINDDEEEE##%%/ 	BN94M4M 	BJEY^EEFFFF@	@@AAA  	A(33FF (	22G#++L).IIF^^K$@@F{.55 !) $d 
 !'#) 	 "( 7 6  H  "1%-5CZ__Fzz*%% A:@z"#67&^z&x D D Dy~/Bq/B/BCCCCCCCC 7 7 7y~s1vv666666667s0   E4E9 9GF%G%G2!GGGre   errorc                 2    | d|g g g g i g dddddddg g dd
d
S )Nrn   r   )
total_parameterstotal_lims_resultstotal_equipment_readingstotal_maintenance_recordsdeviations_foundmissing_valueslow_confidence_countoperators_foundequipment_foundr@   )
re   rg   rn   
parameterslims_resultsequipment_readingsmaintenance_recordsdeviation_details
sop_limitsr>    )re   rn   s     r   ri   ri   F  sT     ! !"#())* !$%!!!(
 
  r    )osrU   rD   r   pathlibr   openair   file_readerr   dotenvr   getenvrO   rN   boolr   r   dictrc   rm   r   ri   r   r    r   <module>r      su   				                ) ) ) ) ) )       T    			"233	4	4	4BRd t    cT cd c c c c\4 \D \ \ \ \F37d 37t 37 37 37 37lc # $      r    