import os
import json
import base64
import fitz  # PyMuPDF
from pathlib import Path
from openai import OpenAI
from file_reader import read_file_content
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before the
# client is built; override=True lets .env values replace variables that are
# already set in the environment.
load_dotenv(override=True)
# Module-level OpenAI client shared by the extraction functions in this file.
# os.getenv returns None when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Prompt template sent to the model for every document.
# It describes the JSON structure the model must return and the per-document
# extraction rules. The two placeholders near the end, <filename> and
# <content>, are filled in with str.replace() by the extraction functions
# (not str.format), so the literal JSON braces below need no escaping.
EXTRACT_PROMPT = """You are a pharma data extraction engine.

Extract EVERY field available from this document. Do not skip any field.

Return JSON only with this structure:
{
  "filename": string,
  "document_type_code": one of ["SOP","BMR","EQP","LIMS","MAINT","DEV","UNKNOWN"],
  "batch_id": string or null,
  "product_name": string or null,
  "plant": string or null,
  "manufacturing_date": string or null,
  "parameters": [
    {
      "parameter_name": string,
      "observed_value": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "process_step": string or null,
      "equipment_id": string or null,
      "timestamp": string or null,
      "operator_id": string or null,
      "status": one of ["Normal", "DEVIATION", "PASS", "FAIL", "Missing"],
      "notes": string or null,
      "source_reference": string or null,
      "confidence": number 0-100
    }
  ],
  "lims_results": [
    {
      "sample_id": string or null,
      "test_name": string or null,
      "result_value": string or null,
      "specification_low": string or null,
      "specification_high": string or null,
      "unit": string or null,
      "status": string or null,
      "test_date": string or null,
      "analyst_id": string or null,
      "instrument_id": string or null,
      "notes": string or null
    }
  ],
  "equipment_readings": [
    {
      "timestamp": string or null,
      "equipment_id": string or null,
      "parameter": string or null,
      "value": string or null,
      "unit": string or null,
      "setpoint": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "status": string or null,
      "operator_id": string or null,
      "notes": string or null
    }
  ],
  "maintenance_records": [
    {
      "equipment_id": string or null,
      "maintenance_date": string or null,
      "maintenance_type": string or null,
      "performed_by": string or null,
      "duration_hours": string or null,
      "observation": string or null,
      "action_taken": string or null,
      "next_pm_due": string or null,
      "work_order_id": string or null,
      "sign_off_by": string or null,
      "status": string or null
    }
  ],
  "deviation_details": {
    "deviation_id": string or null,
    "severity": string or null,
    "description": string or null,
    "reported_by": string or null,
    "date_of_occurrence": string or null,
    "time_of_occurrence": string or null,
    "immediate_actions": array of strings,
    "equipment_id": string or null,
    "calibration_status": string or null,
    "batch_disposition": string or null,
    "root_cause_preliminary": string or null,
    "linked_records": object or null
  },
  "sop_limits": [
    {
      "process_step": string or null,
      "equipment_id": string or null,
      "parameter_name": string or null,
      "lower_limit": string or null,
      "upper_limit": string or null,
      "unit": string or null,
      "criticality": string or null,
      "action_if_exceeded": string or null
    }
  ],
  "metadata": {
    "total_parameters": number,
    "total_lims_results": number,
    "total_equipment_readings": number,
    "total_maintenance_records": number,
    "deviations_found": number,
    "missing_values": number,
    "low_confidence_count": number,
    "operators_found": array of strings,
    "equipment_found": array of strings,
    "extraction_method": string
  }
}

Field extraction rules:
- BMR file: fill parameters[] — each row has batch_id, step_name, equipment_id, timestamp, parameter_name, observed_value, lower_limit, upper_limit, unit, status
- Equipment Log file: fill equipment_readings[] — each row has timestamp, equipment_id, parameter, value, unit, setpoint, lower_limit, upper_limit, status, operator_id, notes
- LIMS file: fill lims_results[] — each row has sample_id, batch_id, test_name, result_value, specification_low, specification_high, unit, status, test_date, analyst_id, instrument_id, notes
- Maintenance file: fill maintenance_records[] — each row has equipment_id, maintenance_date, maintenance_type, performed_by, observation, action_taken, next_pm_due, work_order_id, sign_off_by, status
- Deviation JSON: fill deviation_details{} with all fields
- SOP PDF: fill sop_limits[] with all CPP rows

CRITICAL: Never leave operator_id null if it exists in the source data.
CRITICAL: Never leave status null if it exists in the source data.
CRITICAL: Extract ALL rows — do not summarize or skip rows.
CRITICAL: If this is a scanned/handwritten document — read every visible field carefully including stamps, signatures, handwritten values.

Document filename: <filename>
Document content:
<content>
"""


# ─────────────────────────────────────────
# HELPER — detect if PDF is scanned
# ─────────────────────────────────────────

def is_scanned_pdf(file_path: Path, min_text_chars: int = 50) -> bool:
    """Return True when a PDF has (almost) no extractable text.

    The stripped text of every page is counted; a document whose total stays
    below ``min_text_chars`` is treated as a scanned image.

    Args:
        file_path: Path of the PDF to inspect.
        min_text_chars: Minimum number of extractable characters (summed over
            all pages) for the PDF to count as text-based.

    Returns:
        True if the total extractable text is shorter than ``min_text_chars``.
        False if enough text was found, or if the file could not be inspected
        (best-effort: the caller then falls back to the text path).
    """
    try:
        doc = fitz.open(str(file_path))
        try:
            char_count = 0
            for page in doc:
                char_count += len(page.get_text().strip())
                # Enough text already — no need to read the remaining pages.
                if char_count >= min_text_chars:
                    return False
            return char_count < min_text_chars
        finally:
            # Always release the document, even if a page fails to parse.
            doc.close()
    except Exception as exc:
        # Best-effort detection: report the problem instead of hiding it,
        # but keep the original "treat as not scanned" fallback.
        print(f"[EXTRACT] Could not inspect PDF text layer for {file_path}: {exc}")
        return False


def is_image_file(file_path: Path) -> bool:
    """Tell whether the path's extension (compared case-insensitively) is a known image type."""
    image_suffixes = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"}
    suffix = file_path.suffix.lower()
    return suffix in image_suffixes


# ─────────────────────────────────────────
# EXTRACT SCANNED PDF or IMAGE via GPT-4o Vision
# ─────────────────────────────────────────

# Image formats that can be sent to the vision endpoint unchanged, keyed by
# lower-case file suffix. Other image formats (TIFF, BMP) are re-encoded as PNG.
_VISION_PASSTHROUGH_MIME = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}


def _collect_vision_images(file_path: Path, max_pages: int, dpi: int) -> list:
    """Return the file as a list of {"b64": ..., "mime": ...} dicts, one per image/page."""
    if is_image_file(file_path):
        mime = _VISION_PASSTHROUGH_MIME.get(file_path.suffix.lower())
        if mime is not None:
            img_bytes = file_path.read_bytes()
        else:
            # TIFF/BMP are not accepted by the vision endpoint, so re-encode
            # them as PNG. Only the first image of a multi-page TIFF is loaded.
            pix = fitz.Pixmap(str(file_path))
            if pix.n - pix.alpha > 3:
                # PNG cannot store CMYK — convert to RGB first.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            img_bytes = pix.tobytes("png")
            mime = "image/png"
        return [{"b64": base64.b64encode(img_bytes).decode(), "mime": mime}]

    # Scanned PDF — render each page to a PNG image.
    doc = fitz.open(str(file_path))
    try:
        page_count = len(doc)
        if page_count > max_pages:
            print(
                f"[EXTRACT] {file_path.name} has {page_count} pages; "
                f"only the first {max_pages} will be sent"
            )
        images = []
        for page_num in range(min(page_count, max_pages)):
            pix = doc[page_num].get_pixmap(dpi=dpi)
            images.append({
                "b64": base64.b64encode(pix.tobytes("png")).decode(),
                "mime": "image/png",
            })
        return images
    finally:
        # Release the document even when rendering a page fails.
        doc.close()


def extract_with_vision(file_path: Path, *, max_pages: int = 50, dpi: int = 300) -> dict:
    """Extract structured data from a scanned PDF or image with GPT-4o Vision.

    The file is turned into base64-encoded images (one per PDF page, or the
    single image file) and sent together with the extraction prompt.

    Args:
        file_path: Scanned PDF or image file to process.
        max_pages: Maximum number of PDF pages rendered and sent.
        dpi: Resolution used when rendering PDF pages.

    Returns:
        The parsed JSON object returned by the model, with
        ``metadata["extraction_method"]`` set to ``"gpt4o_vision_ocr"``.

    Raises:
        ValueError: If no image could be produced from the file or the model
            returned an empty message.
        json.JSONDecodeError: If the model response is not valid JSON.
    """
    print(f"[EXTRACT] Using GPT-4o Vision for scanned file: {file_path.name}")
    images_b64 = _collect_vision_images(file_path, max_pages, dpi)

    if not images_b64:
        raise ValueError("No images extracted from file")

    total_pages = len(images_b64)
    print(f"[EXTRACT] Sending {total_pages} pages to GPT-4o Vision")

    # Build vision message
    content = [
        {
            "type": "text",
            "text": f"""You are a pharma data extraction engine specializing in scanned pharma documents.
This document has {total_pages} pages. Extract data from ALL pages.

CRITICAL INSTRUCTIONS for scanned/handwritten documents:
- Read EVERY cell in every table row across ALL {total_pages} pages — do not skip any row
- CRITICAL: Look for rows with status DEVIATION — these are the most important rows
- CRITICAL: Look for temperature readings above 70°C — these indicate deviations
- For unclear or partially visible text — make your best interpretation based on context
- For handwritten numbers — read carefully, common values are temperatures (60-80°C), weights (kg), percentages
- Never write "(unclear)" — always provide your best reading of the value
- If a value is truly unreadable — use null, not "(unclear)"
- Read ALL {total_pages} pages — extract data from every page not just the first
- For rotated or sideways text — rotate mentally and read it
- For faded text — use context clues from surrounding cells to interpret
- Batch IDs follow pattern like BTC0048, equipment IDs like EQP_GRAN_01
- Status values are typically: Normal, DEVIATION, Alert, Pass, Fail

Document filename: {file_path.name}

{EXTRACT_PROMPT.replace('<filename>', file_path.name).replace('<content>', '[See scanned document images above — read every visible field]')}"""
        }
    ]

    # Add all page images
    for img in images_b64:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{img['mime']};base64,{img['b64']}",
                "detail": "high"
            }
        })

    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=6000,
        messages=[
            {
                "role": "system",
                "content": "You are a pharma data extraction engine. Extract every visible field from scanned documents including handwritten data. Return JSON only."
            },
            {
                "role": "user",
                "content": content
            }
        ],
        response_format={"type": "json_object"},
        temperature=0
    )

    raw = response.choices[0].message.content
    if not raw:
        # Gives a clear error instead of a TypeError from json.loads(None).
        raise ValueError("Empty response from vision model")
    result = json.loads(raw)

    # The model may omit "metadata" or return null for it; always end up
    # with a dict so the extraction method can be recorded.
    metadata = result.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}
    result["metadata"] = metadata
    metadata["extraction_method"] = "gpt4o_vision_ocr"
    return result


# ─────────────────────────────────────────
# MAIN EXTRACT FUNCTION
# ─────────────────────────────────────────

async def extract_document(file_path: Path) -> dict:
    """Extract structured data from one document and return it as a dict.

    Image files and PDFs without a text layer go through the vision path;
    everything else is read as text and sent with the extraction prompt.

    NOTE(review): the file reading and OpenAI calls below are synchronous, so
    this coroutine blocks the event loop while they run.

    Args:
        file_path: Document to extract.

    Returns:
        On success, the model's JSON object with ``filename`` set to the file
        name, ``status`` set to ``"extracted"`` and
        ``metadata["extraction_method"]`` recorded. On any failure, the
        structure built by ``_error_response`` (``status == "error"``).
    """
    try:
        # Step 1 — detect if scanned image or normal text document
        use_vision = False

        if is_image_file(file_path):
            use_vision = True
            print(f"[EXTRACT] Image file detected: {file_path.name}")
        elif file_path.suffix.lower() == ".pdf" and is_scanned_pdf(file_path):
            use_vision = True
            print(f"[EXTRACT] Scanned PDF detected: {file_path.name}")
        else:
            print(f"[EXTRACT] Text-based file: {file_path.name}")

        # Step 2 — extract using correct method
        if use_vision:
            result = extract_with_vision(file_path)
        else:
            # Normal text extraction
            max_content_chars = 8000
            content = read_file_content(file_path)
            if len(content) > max_content_chars:
                # The prompt asks for ALL rows, so make the truncation visible
                # instead of silently dropping the tail of the document.
                print(
                    f"[EXTRACT] {file_path.name}: content truncated from "
                    f"{len(content)} to {max_content_chars} characters"
                )
            prompt = EXTRACT_PROMPT.replace("<filename>", file_path.name)
            prompt = prompt.replace("<content>", content[:max_content_chars])

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a pharma data extraction engine. Extract every field from every row. Never skip data. Return JSON only. No extra text."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                response_format={"type": "json_object"},
                temperature=0
            )

            raw = response.choices[0].message.content
            result = json.loads(raw)

            # Always record the extraction method, even when the model left
            # "metadata" out or returned null — same as the vision path.
            metadata = result.get("metadata")
            if not isinstance(metadata, dict):
                metadata = {}
                result["metadata"] = metadata
            metadata["extraction_method"] = "text"

        # Trust the real file name over whatever the model echoed back.
        result["filename"] = file_path.name
        result["status"] = "extracted"
        return result

    except json.JSONDecodeError as e:
        return _error_response(file_path.name, f"parse_error: {e}")
    except Exception as e:
        # Top-level boundary: every failure becomes an error payload.
        return _error_response(file_path.name, str(e))


def _error_response(filename: str, error: str) -> dict:
    return {
        "filename": filename,
        "status": "error",
        "error": error,
        "parameters": [],
        "lims_results": [],
        "equipment_readings": [],
        "maintenance_records": [],
        "deviation_details": {},
        "sop_limits": [],
        "metadata": {
            "total_parameters": 0,
            "total_lims_results": 0,
            "total_equipment_readings": 0,
            "total_maintenance_records": 0,
            "deviations_found": 0,
            "missing_values": 0,
            "low_confidence_count": 0,
            "operators_found": [],
            "equipment_found": [],
            "extraction_method": "error"
        }
    }