import gradio as gr import onnxruntime import cv2 import numpy as np import json import torch from PIL import Image from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration MODEL_PATH = "ssd_mobilenet_v1_12-int8.onnx" LABELS_PATH = "labels_coco_90.json" device = "cpu" # ===== Load labels ===== with open(LABELS_PATH, "r") as f: raw_labels = json.load(f) labels = [] if isinstance(raw_labels, dict): for k in sorted(raw_labels.keys(), key=lambda x: int(x)): labels.append(raw_labels[k]) else: labels = raw_labels # ===== Load SSD ONNX ===== sess = onnxruntime.InferenceSession( MODEL_PATH, providers=["CPUExecutionProvider"], ) # ===== Load OneVision ===== processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") caption_model = LlavaOnevisionForConditionalGeneration.from_pretrained( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", torch_dtype=torch.float32, low_cpu_mem_usage=True ).to(device) caption_model.eval() # ===== SSD Detection ===== def ssd_detect(img): resized = cv2.resize(img, (300, 300)) blob = np.expand_dims(resized, axis=0).astype(np.uint8) outputs = sess.run([o.name for o in sess.get_outputs()], {"inputs": blob}) det = outputs[0] if det.ndim == 3: det = det[0] elif det.ndim == 4: det = det[0][0] h, w = img.shape[:2] results = [] THRESH = 0.4 for row in det: if len(row) == 7: cid = int(row[1]) conf = float(row[2]) xmin, ymin, xmax, ymax = row[3:7] elif len(row) == 6: cid = int(row[0]) conf = float(row[1]) xmin, ymin, xmax, ymax = row[2:6] elif len(row) == 4: continue else: continue if conf < THRESH: continue if cid < 0 or cid >= len(labels): continue results.append({ "label": labels[cid], "score": conf, "bbox": [ int(xmin * w), int(ymin * h), int(xmax * w), int(ymax * h) ] }) return results # ===== Caption Generation ===== def generate_caption(pil_img): prompt = ( "Provide a short accident report including damage severity " "(mild/moderate/severe), broken parts, debris, leaking fluids, " "vehicle positions, blocked lanes, and people involved." ) inputs = processor( images=[pil_img], text=prompt, return_tensors="pt" ).to(device) output = caption_model.generate( **inputs, max_new_tokens=150, temperature=0.2, do_sample=False ) caption = processor.decode(output[0], skip_special_tokens=True).strip() return caption # ===== Combined Pipeline ===== def detect_objects(pil_image): if pil_image is None: return {"error": "No image"} img_bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) objects = ssd_detect(img_bgr) caption = generate_caption(pil_image) return { "caption": caption, "objects": objects } # ===== Gradio UI ===== ui = gr.Interface( fn=detect_objects, inputs=gr.Image(type="pil"), outputs="json", title="Accident Analyzer (MobileNet SSD + OneVision)", description="Detect objects and generate an AI accident summary using CPU." ) if __name__ == "__main__": ui.launch()