import gradio as gr
import onnxruntime
import cv2
import numpy as np
import json
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

MODEL_PATH = "ssd_mobilenet_v1_12-int8.onnx"
LABELS_PATH = "labels_coco_90.json"
device = "cpu"

# ===== Load labels =====
with open(LABELS_PATH, "r") as f:
    raw_labels = json.load(f)

labels = []
if isinstance(raw_labels, dict):
    for k in sorted(raw_labels.keys(), key=lambda x: int(x)):
        labels.append(raw_labels[k])
else:
    labels = raw_labels

# ===== Load SSD ONNX =====
sess = onnxruntime.InferenceSession(
    MODEL_PATH,
    providers=["CPUExecutionProvider"],
)

# ===== Load OneVision =====
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
caption_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
).to(device)
caption_model.eval()


# ===== SSD Detection =====
def ssd_detect(img):
    resized = cv2.resize(img, (300, 300))
    blob = np.expand_dims(resized, axis=0).astype(np.uint8)

    outputs = sess.run([o.name for o in sess.get_outputs()], {"inputs": blob})
    det = outputs[0]

    if det.ndim == 3:
        det = det[0]
    elif det.ndim == 4:
        det = det[0][0]

    h, w = img.shape[:2]
    results = []
    THRESH = 0.4

    for row in det:
        if len(row) == 7:
            cid = int(row[1])
            conf = float(row[2])
            xmin, ymin, xmax, ymax = row[3:7]

        elif len(row) == 6:
            cid = int(row[0])
            conf = float(row[1])
            xmin, ymin, xmax, ymax = row[2:6]

        elif len(row) == 4:
            continue

        else:
            continue

        if conf < THRESH:
            continue
        if cid < 0 or cid >= len(labels):
            continue

        results.append({
            "label": labels[cid],
            "score": conf,
            "bbox": [
                int(xmin * w),
                int(ymin * h),
                int(xmax * w),
                int(ymax * h)
            ]
        })

    return results


# ===== Caption Generation =====
def generate_caption(pil_img):
    prompt = (
        "Provide a short accident report including damage severity "
        "(mild/moderate/severe), broken parts, debris, leaking fluids, "
        "vehicle positions, blocked lanes, and people involved."
    )

    inputs = processor(
        images=[pil_img],
        text=prompt,
        return_tensors="pt"
    ).to(device)

    output = caption_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.2,
        do_sample=False
    )

    caption = processor.decode(output[0], skip_special_tokens=True).strip()
    return caption


# ===== Combined Pipeline =====
def detect_objects(pil_image):
    if pil_image is None:
        return {"error": "No image"}

    img_bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    objects = ssd_detect(img_bgr)
    caption = generate_caption(pil_image)

    return {
        "caption": caption,
        "objects": objects
    }


# ===== Gradio UI =====
ui = gr.Interface(
    fn=detect_objects,
    inputs=gr.Image(type="pil"),
    outputs="json",
    title="Accident Analyzer (MobileNet SSD + OneVision)",
    description="Detect objects and generate an AI accident summary using CPU."
)

if __name__ == "__main__":
    ui.launch()