o
    n j?4                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ edd ee dd	Zd
ZdedefddZdedefddZdedefddZdedefddZdededefddZdS )    N)Path)OpenAI)read_file_content)load_dotenvT)overrideOPENAI_API_KEY)api_keyu{  You are a pharma data extraction engine.

Extract EVERY field available from this document. Do not skip any field.

Return JSON only with this structure:
{
  "filename": string,
  "document_type_code": one of ["SOP","BMR","EQP","LIMS","MAINT","DEV","UNKNOWN"],
  "batch_id": string or null,
  "product_name": string or null,
  "plant": string or null,
  "manufacturing_date": string or null,
  "parameters": [
    {
      "parameter_name": string,
      "observed_value": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "process_step": string or null,
      "equipment_id": string or null,
      "timestamp": string or null,
      "operator_id": string or null,
      "status": one of ["Normal", "DEVIATION", "PASS", "FAIL", "Missing"],
      "notes": string or null,
      "source_reference": string or null,
      "confidence": number 0-100
    }
  ],
  "lims_results": [
    {
      "sample_id": string or null,
      "test_name": string or null,
      "result_value": string or null,
      "specification_low": string or null,
      "specification_high": string or null,
      "unit": string or null,
      "status": string or null,
      "test_date": string or null,
      "analyst_id": string or null,
      "instrument_id": string or null,
      "notes": string or null
    }
  ],
  "equipment_readings": [
    {
      "timestamp": string or null,
      "equipment_id": string or null,
      "parameter": string or null,
      "value": string or null,
      "unit": string or null,
      "setpoint": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "status": string or null,
      "operator_id": string or null,
      "notes": string or null
    }
  ],
  "maintenance_records": [
    {
      "equipment_id": string or null,
      "maintenance_date": string or null,
      "maintenance_type": string or null,
      "performed_by": string or null,
      "duration_hours": string or null,
      "observation": string or null,
      "action_taken": string or null,
      "next_pm_due": string or null,
      "work_order_id": string or null,
      "sign_off_by": string or null,
      "status": string or null
    }
  ],
  "deviation_details": {
    "deviation_id": string or null,
    "severity": string or null,
    "description": string or null,
    "reported_by": string or null,
    "date_of_occurrence": string or null,
    "time_of_occurrence": string or null,
    "immediate_actions": array of strings,
    "equipment_id": string or null,
    "calibration_status": string or null,
    "batch_disposition": string or null,
    "root_cause_preliminary": string or null,
    "linked_records": object or null
  },
  "sop_limits": [
    {
      "process_step": string or null,
      "equipment_id": string or null,
      "parameter_name": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "criticality": string or null,
      "action_if_exceeded": string or null
    }
  ],
  "metadata": {
    "total_parameters": number,
    "total_lims_results": number,
    "total_equipment_readings": number,
    "total_maintenance_records": number,
    "deviations_found": number,
    "missing_values": number,
    "low_confidence_count": number,
    "operators_found": array of strings,
    "equipment_found": array of strings,
    "extraction_method": string
  }
}

Field extraction rules:
- BMR file: fill parameters[] — each row has batch_id, step_name, equipment_id, timestamp, parameter_name, observed_value, lower_limit, upper_limit, unit, status
- Equipment Log file: fill equipment_readings[] — each row has timestamp, equipment_id, parameter, value, unit, setpoint, lower_limit, upper_limit, status, operator_id, notes
- LIMS file: fill lims_results[] — each row has sample_id, batch_id, test_name, result_value, specification_low, specification_high, unit, status, test_date, analyst_id, instrument_id, notes
- Maintenance file: fill maintenance_records[] — each row has equipment_id, maintenance_date, maintenance_type, performed_by, observation, action_taken, next_pm_due, work_order_id, sign_off_by, status
- Deviation JSON: fill deviation_details{} with all fields
- SOP PDF: fill sop_limits[] with all CPP rows

CRITICAL: Never leave operator_id null if it exists in the source data.
CRITICAL: Never leave status null if it exists in the source data.
CRITICAL: Extract ALL rows — do not summarize or skip rows.
CRITICAL: If this is a scanned/handwritten document — read every visible field carefully including stamps, signatures, handwritten values.

Document filename: <filename>
Document content:
<content>
	file_pathreturnc                 C   sX   z!t t| }d}|D ]
}||  7 }q|  t|dk W S  ty+   Y dS w )uN   Returns True if PDF has no extractable text — meaning it is a scanned image. 2   F)fitzopenstrget_textstripcloselen	Exception)r	   doc
total_textpage r   +/var/www/html/fyndo/pharma/fyndo/extract.pyis_scanned_pdf   s   r   c                 C   s   | j  dv S )z!Returns True if file is an image.)z.pngz.jpgz.jpegz.tiffz.tifz.bmpz.webp)suffixlower)r	   r   r   r   is_image_file   s   r   c                 C   s  t d| j  g }t| rIt| d}| }W d   n1 s"w   Y  t| }| j	 
dd}|dkr=d}||d| d	 n5tt| }ttt|d
D ] }|| }|jdd}	|	d}t| }||dd	 qY|  |stdt|}
t d|
 d dd|
 d|
 d|
 d| j dt
d| j
dd 
dg}|D ]}|dd|d  d|d   d!d"d# qtjjjd$d%d&d'd(d)|d(gd*d+id,d-}|jd, jj}t|}|d.i |d.< d/|d. d0< |S )1zg
    Converts scanned PDF pages or images to base64
    and sends to GPT-4o Vision for extraction.
    z0[EXTRACT] Using GPT-4o Vision for scanned file: rbN.r   jpgjpegzimage/)b64mimer   i,  )dpipngz	image/pngzNo images extracted from filez[EXTRACT] Sending z pages to GPT-4o VisiontextzdYou are a pharma data extraction engine specializing in scanned pharma documents.
This document has z pages. Extract data from ALL pages.

CRITICAL INSTRUCTIONS for scanned/handwritten documents:
- Read EVERY cell in every table row across ALL u-   pages — do not skip any row
- CRITICAL: Look for rows with status DEVIATION — these are the most important rows
- CRITICAL: Look for temperature readings above 70°C — these indicate deviations
- For unclear or partially visible text — make your best interpretation based on context
- For handwritten numbers — read carefully, common values are temperatures (60-80°C), weights (kg), percentages
- Never write "(unclear)" — always provide your best reading of the value
- If a value is truly unreadable — use null, not "(unclear)"
- Read ALL ue   pages — extract data from every page not just the first
- For rotated or sideways text — rotate mentally and read it
- For faded text — use context clues from surrounding cells to interpret
- Batch IDs follow pattern like BTC0048, equipment IDs like EQP_GRAN_01
- Status values are typically: Normal, DEVIATION, Alert, Pass, Fail

Document filename: z


<filename>	<content>u@   [See scanned document images above — read every visible field])typer&   	image_urlzdata:r#   z;base64,r"   high)urldetail)r)   r*   gpt-4oip  systemzYou are a pharma data extraction engine. Extract every visible field from scanned documents including handwritten data. Return JSON only.rolecontentuserr)   json_objectr   )model
max_tokensmessagesresponse_formattemperaturemetadatagpt4o_vision_ocrextraction_method) printnamer   r   readbase64	b64encodedecoder   r   replaceappendr   r   rangeminr   
get_pixmaptobytesr   
ValueErrorEXTRACT_PROMPTclientchatcompletionscreatechoicesmessager2   jsonloadsget)r	   
images_b64f	img_bytesr"   extr   page_numr   pixtotal_pagesr2   imgresponserawresultr   r   r   extract_with_vision   s|   




r_   c              
      st  zd}t | rd}td| j  n| j dkr)t| r)d}td| j  ntd| j  |r8t| }nAt| }t	d| j}|	d|d d	 }t
jjjd
dddd|dgddidd}|jd jj}t|}|dryd|d d< | j|d< d|d< |W S  tjy } zt| jd| W  Y d }~S d }~w ty } zt| jt|W  Y d }~S d }~ww )NFTz[EXTRACT] Image file detected: z.pdfz [EXTRACT] Scanned PDF detected: z[EXTRACT] Text-based file: r'   r(   i@  r.   r/   z~You are a pharma data extraction engine. Extract every field from every row. Never skip data. Return JSON only. No extra text.r0   r3   r)   r4   r   )r5   r7   r8   r9   r:   r&   r<   filename	extractedstatuszparse_error: )r   r=   r>   r   r   r   r_   r   rJ   rC   rK   rL   rM   rN   rO   rP   r2   rQ   rR   rS   JSONDecodeError_error_responser   r   )r	   
use_visionr^   r2   promptr\   r]   er   r   r   extract_document  sR   




rh   r`   errorc                 C   s0   | d|g g g g i g dddddddg g dd
d
S )Nri   r   )
total_parameterstotal_lims_resultstotal_equipment_readingstotal_maintenance_recordsdeviations_foundmissing_valueslow_confidence_countoperators_foundequipment_foundr<   )
r`   rb   ri   
parameterslims_resultsequipment_readingsmaintenance_recordsdeviation_details
sop_limitsr:   r   )r`   ri   r   r   r   rd   F  s*   rd   )osrQ   r@   r   pathlibr   openair   file_readerr   dotenvr   getenvrK   rJ   boolr   r   dictr_   rh   r   rd   r   r   r   r   <module>   s"    
 
	c6