ariG23498 HF Staff committed on
Commit 75df5eb · 1 Parent(s): cb90111
Files changed (1)
  1. app.py +171 -155
app.py CHANGED
@@ -1,230 +1,246 @@
 import gradio as gr
 import spaces
 import torch
 from transformers import (
     AutoProcessor,
     AutoModelForZeroShotObjectDetection,
-    Owlv2ForObjectDetection,
-    OmDetTurboForObjectDetection,
 )
-from PIL import Image
-import time
 
 
-def extract_model_short_name(model_id):
     return model_id.split("/")[-1].replace("-", " ").replace("_", " ")
 
 
 model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
 processor_llmdet = AutoProcessor.from_pretrained(model_llmdet_id)
-model_llmdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_llmdet_id)
 
 model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
 processor_mm_grounding = AutoProcessor.from_pretrained(model_mm_grounding_id)
-model_mm_grounding = AutoModelForZeroShotObjectDetection.from_pretrained(
-    model_mm_grounding_id
 )
 
 model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
 processor_omdet = AutoProcessor.from_pretrained(model_omdet_id)
-model_omdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_omdet_id)
 
 model_owlv2_id = "google/owlv2-large-patch14-ensemble"
 processor_owlv2 = AutoProcessor.from_pretrained(model_owlv2_id)
-model_owlv2 = AutoModelForZeroShotObjectDetection.from_pretrained(model_owlv2_id)
-
-model_llmdet_name = extract_model_short_name(model_llmdet_id)
-model_mm_grounding_name = extract_model_short_name(model_mm_grounding_id)
-model_omdet_name = extract_model_short_name(model_omdet_id)
-model_owlv2_name = extract_model_short_name(model_owlv2_id)
 
 
 @spaces.GPU
-def detect(model, processor, image: Image.Image, prompts: list, threshold: float):
     t0 = time.perf_counter()
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model.to(device).eval()
     texts = [prompts]
-    inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
     with torch.inference_mode():
-        outputs = model(**inputs)
-    results = processor.post_process_grounded_object_detection(
         outputs, threshold=threshold, target_sizes=[image.size[::-1]]
-    )
-    result = results[0]
     annotations = []
 
-    if isinstance(model, Owlv2ForObjectDetection) or isinstance(
-        model, OmDetTurboForObjectDetection
-    ):
-        key = "labels"
-        check = True
-    else:
-        key = "text_labels"
-        check = False
-
-    for box, score, label in zip(result["boxes"], result["scores"], result[key]):
-        if score >= threshold:
-            if check:
-                label_id = label
-                label_name = prompts[label_id]
             else:
-                label_name = label
-            xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
-            annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
     elapsed_ms = (time.perf_counter() - t0) * 1000
-    time_taken = f"**Inference time ({model_omdet_name}):** {elapsed_ms:.0f} ms"
     return annotations, time_taken
 
 
 def run_detection(
     image: Image.Image,
     prompts_str: str,
-    threshold_llm,
-    threshold_mm,
-    threshold_owlv2,
-    threshold_omdet,
 ):
-    prompts = [p.strip() for p in prompts_str.split(",")]
-    ann_llm, time_llm = detect(
-        model_llmdet, processor_llmdet, image, prompts, threshold_llm
-    )
-    ann_mm, time_mm = detect(
-        model_mm_grounding, processor_mm_grounding, image, prompts, threshold_mm
-    )
-    ann_owlv2, time_owlv2 = detect(
-        model_owlv2, processor_owlv2, image, prompts, threshold_owlv2
-    )
-    ann_omdet, time_omdet = detect(
-        model_omdet, processor_omdet, image, prompts, threshold_omdet
-    )
     return (
-        (image, ann_llm),
-        time_llm,
-        (image, ann_mm),
-        time_mm,
-        (image, ann_owlv2),
-        time_owlv2,
-        (image, ann_omdet),
-        time_omdet,
     )
 
 
 with gr.Blocks() as app:
-    gr.Markdown("# Zero-Shot Object Detection Arena")
-    gr.Markdown(
-        "### Compare different zero-shot object detection models on the same image and prompts."
-    )
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload an image", height=400)
             prompts = gr.Textbox(
-                label="Prompts (comma-separated)", value="a cat, a remote control"
             )
             with gr.Accordion("Per-model confidence thresholds", open=True):
-                threshold_llm = gr.Slider(
-                    label="Threshold for LLMDet", minimum=0.0, maximum=1.0, value=0.3
-                )
-                threshold_mm = gr.Slider(
-                    label="Threshold for MM GroundingDINO Tiny",
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.3,
-                )
-                threshold_owlv2 = gr.Slider(
-                    label="Threshold for OwlV2 Large",
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.1,
-                )
-                threshold_omdet = gr.Slider(
-                    label="Threshold for OMDet Turbo Swin Tiny",
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.2,
-                )
             generate_btn = gr.Button(value="Detect")
     with gr.Row():
         with gr.Column(scale=2):
-            output_image_llm = gr.AnnotatedImage(
-                label=f"Annotated image for {model_llmdet_name}", height=400
-            )
             output_time_llm = gr.Markdown()
         with gr.Column(scale=2):
-            output_image_mm = gr.AnnotatedImage(
-                label=f"Annotated image for {model_mm_grounding_name}", height=400
-            )
             output_time_mm = gr.Markdown()
     with gr.Row():
         with gr.Column(scale=2):
-            output_image_owlv2 = gr.AnnotatedImage(
-                label=f"Annotated image for {model_owlv2_name}", height=400
-            )
             output_time_owlv2 = gr.Markdown()
         with gr.Column(scale=2):
-            output_image_omdet = gr.AnnotatedImage(
-                label=f"Annotated image for {model_omdet_name}", height=400
-            )
             output_time_omdet = gr.Markdown()
     gr.Markdown("### Examples")
     example_data = [
-        [
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
-            "a cat, a remote control",
-            0.30,
-            0.30,
-            0.10,
-            0.30,
-        ],
-        [
-            "http://images.cocodataset.org/val2017/000000000139.jpg",
-            "a person, a tv, a remote",
-            0.35,
-            0.30,
-            0.12,
-            0.30,
-        ],
     ]
 
     gr.Examples(
         examples=example_data,
-        inputs=[
-            image,
-            prompts,
-            threshold_llm,
-            threshold_mm,
-            threshold_owlv2,
-            threshold_omdet,
-        ],
         label="Click an example to populate the inputs",
     )
-    inputs = [
-        image,
-        prompts,
-        threshold_llm,
-        threshold_mm,
-        threshold_owlv2,
-        threshold_omdet,
-    ]
     outputs = [
-        output_image_llm,
-        output_time_llm,
-        output_image_mm,
-        output_time_mm,
-        output_image_owlv2,
-        output_time_owlv2,
-        output_image_omdet,
-        output_time_omdet,
     ]
-    generate_btn.click(
-        fn=run_detection,
-        inputs=inputs,
-        outputs=outputs,
-    )
-    image.upload(
-        fn=run_detection,
-        inputs=inputs,
-        outputs=outputs,
-    )
 
-app.launch()
+import time
+from dataclasses import dataclass
+from typing import List, Tuple
+
 import gradio as gr
 import spaces
 import torch
+from PIL import Image
 from transformers import (
     AutoProcessor,
     AutoModelForZeroShotObjectDetection,
 )
 
+# ---------------------------
+# Setup
+# ---------------------------
 
+def extract_model_short_name(model_id: str) -> str:
     return model_id.split("/")[-1].replace("-", " ").replace("_", " ")
 
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
+# (Optional) modest speed-ups
+torch.set_grad_enabled(False)
+
+# Model bundles for cleaner wiring
+@dataclass
+class ZSDetBundle:
+    model_id: str
+    model_name: str
+    processor: AutoProcessor
+    model: AutoModelForZeroShotObjectDetection
+    use_label_ids: bool  # True for OWLv2/OMDet (labels are indices), False for others
+
+# LLMDet
 model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
 processor_llmdet = AutoProcessor.from_pretrained(model_llmdet_id)
+model_llmdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_llmdet_id).to(DEVICE).eval()
+bundle_llmdet = ZSDetBundle(
+    model_id=model_llmdet_id,
+    model_name=extract_model_short_name(model_llmdet_id),
+    processor=processor_llmdet,
+    model=model_llmdet,
+    use_label_ids=False,
+)
 
+# MM GroundingDINO
 model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
 processor_mm_grounding = AutoProcessor.from_pretrained(model_mm_grounding_id)
+model_mm_grounding = AutoModelForZeroShotObjectDetection.from_pretrained(model_mm_grounding_id).to(DEVICE).eval()
+bundle_mm_grounding = ZSDetBundle(
+    model_id=model_mm_grounding_id,
+    model_name=extract_model_short_name(model_mm_grounding_id),
+    processor=processor_mm_grounding,
+    model=model_mm_grounding,
+    use_label_ids=False,
 )
 
+# OMDet Turbo
 model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
 processor_omdet = AutoProcessor.from_pretrained(model_omdet_id)
+model_omdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_omdet_id).to(DEVICE).eval()
+bundle_omdet = ZSDetBundle(
+    model_id=model_omdet_id,
+    model_name=extract_model_short_name(model_omdet_id),
+    processor=processor_omdet,
+    model=model_omdet,
+    use_label_ids=True,  # returns label indices
+)
 
+# OWLv2
 model_owlv2_id = "google/owlv2-large-patch14-ensemble"
 processor_owlv2 = AutoProcessor.from_pretrained(model_owlv2_id)
+model_owlv2 = AutoModelForZeroShotObjectDetection.from_pretrained(model_owlv2_id).to(DEVICE).eval()
+bundle_owlv2 = ZSDetBundle(
+    model_id=model_owlv2_id,
+    model_name=extract_model_short_name(model_owlv2_id),
+    processor=processor_owlv2,
+    model=model_owlv2,
+    use_label_ids=True,  # returns label indices
+)
 
+# ---------------------------
+# Inference
+# ---------------------------
 
 @spaces.GPU
+def detect(
+    bundle: ZSDetBundle,
+    image: Image.Image,
+    prompts: List[str],
+    threshold: float,
+) -> Tuple[List[Tuple[Tuple[int, int, int, int], str]], str]:
+    """
+    Returns [(bbox, label_score_str), ...], time_str
+    """
     t0 = time.perf_counter()
+
+    # HF zero-shot OD expects list-of-list text
     texts = [prompts]
+    inputs = bundle.processor(images=image, text=texts, return_tensors="pt").to(DEVICE)
+
     with torch.inference_mode():
+        if DEVICE == "cuda":
+            # Use autocast on CUDA to speed up mixed-precision-friendly ops
+            with torch.amp.autocast("cuda"):
+                outputs = bundle.model(**inputs)
+        else:
+            outputs = bundle.model(**inputs)
+
+    results = bundle.processor.post_process_grounded_object_detection(
         outputs, threshold=threshold, target_sizes=[image.size[::-1]]
+    )[0]
+
     annotations = []
+    key = "labels" if bundle.use_label_ids else "text_labels"
+
+    for box, score, label in zip(results["boxes"], results["scores"], results[key]):
+        if float(score) < threshold:
+            continue
 
+        if bundle.use_label_ids:
+            # Map label index -> prompt string
+            label_idx = int(label)
+            if 0 <= label_idx < len(prompts):
+                label_name = prompts[label_idx]
             else:
+                label_name = str(label_idx)
+        else:
+            # Direct text label
+            label_name = label if isinstance(label, str) else str(label)
+
+        xmin, ymin, xmax, ymax = map(int, box.tolist())
+        annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {float(score):.2f}"))
+
     elapsed_ms = (time.perf_counter() - t0) * 1000
+    time_taken = f"**Inference time ({bundle.model_name}):** {elapsed_ms:.0f} ms"
     return annotations, time_taken
 
+def parse_prompts(prompts_str: str) -> List[str]:
+    return [p.strip() for p in prompts_str.split(",") if p.strip()]
 
 def run_detection(
     image: Image.Image,
     prompts_str: str,
+    threshold_llm: float,
+    threshold_mm: float,
+    threshold_owlv2: float,
+    threshold_omdet: float,
 ):
+    prompts = parse_prompts(prompts_str)
+
+    ann_llm, time_llm = detect(bundle_llmdet, image, prompts, threshold_llm)
+    ann_mm, time_mm = detect(bundle_mm_grounding, image, prompts, threshold_mm)
+    ann_owlv2, time_owlv2 = detect(bundle_owlv2, image, prompts, threshold_owlv2)
+    ann_omdet, time_omdet = detect(bundle_omdet, image, prompts, threshold_omdet)
+
     return (
+        (image, ann_llm), time_llm,
+        (image, ann_mm), time_mm,
+        (image, ann_owlv2), time_owlv2,
+        (image, ann_omdet), time_omdet,
     )
 
+# ---------------------------
+# Description
+# ---------------------------
+
+description_md = """
+# Zero-Shot Object Detection Arena
+
+Compare **four zero-shot object detectors** on the same image and prompts.
+Upload an image (or pick an example), add **comma-separated prompts**, tweak the per-model **thresholds**, and hit **Detect**.
+You'll see bounding boxes, scores, and **per-model inference time**.
+
+**Models**
+- LLMDet Tiny — [`iSEE-Laboratory/llmdet_tiny`](https://huggingface.co/iSEE-Laboratory/llmdet_tiny)
+- MM GroundingDINO Tiny O365v1 GoldG — [`rziga/mm_grounding_dino_tiny_o365v1_goldg`](https://huggingface.co/rziga/mm_grounding_dino_tiny_o365v1_goldg)
+- OMDet Turbo Swin Tiny — [`omlab/omdet-turbo-swin-tiny-hf`](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf)
+- OWLv2 Large Patch14 Ensemble — [`google/owlv2-large-patch14-ensemble`](https://huggingface.co/google/owlv2-large-patch14-ensemble)
+
+**Tip:** Lower thresholds increase recall but may also increase false positives.
+"""
+
+# ---------------------------
+# UI
+# ---------------------------
 
 with gr.Blocks() as app:
+    gr.Markdown(description_md)
+
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload an image", height=400)
             prompts = gr.Textbox(
+                label="Prompts (comma-separated)",
+                value="a cat, a remote control",
+                placeholder="e.g., a cat, a remote control",
             )
             with gr.Accordion("Per-model confidence thresholds", open=True):
+                threshold_llm = gr.Slider(label=f"Threshold — {bundle_llmdet.model_name}", minimum=0.0, maximum=1.0, value=0.3)
+                threshold_mm = gr.Slider(label=f"Threshold — {bundle_mm_grounding.model_name}", minimum=0.0, maximum=1.0, value=0.3)
+                threshold_owlv2 = gr.Slider(label=f"Threshold — {bundle_owlv2.model_name}", minimum=0.0, maximum=1.0, value=0.1)
+                threshold_omdet = gr.Slider(label=f"Threshold — {bundle_omdet.model_name}", minimum=0.0, maximum=1.0, value=0.2)
             generate_btn = gr.Button(value="Detect")
+
     with gr.Row():
         with gr.Column(scale=2):
+            output_image_llm = gr.AnnotatedImage(label=f"Annotated — {bundle_llmdet.model_name}", height=400)
             output_time_llm = gr.Markdown()
         with gr.Column(scale=2):
+            output_image_mm = gr.AnnotatedImage(label=f"Annotated — {bundle_mm_grounding.model_name}", height=400)
             output_time_mm = gr.Markdown()
+
     with gr.Row():
         with gr.Column(scale=2):
+            output_image_owlv2 = gr.AnnotatedImage(label=f"Annotated — {bundle_owlv2.model_name}", height=400)
             output_time_owlv2 = gr.Markdown()
         with gr.Column(scale=2):
+            output_image_omdet = gr.AnnotatedImage(label=f"Annotated — {bundle_omdet.model_name}", height=400)
             output_time_omdet = gr.Markdown()
+
     gr.Markdown("### Examples")
     example_data = [
+        ["https://images.cocodataset.org/val2017/000000039769.jpg", "a cat, a remote control", 0.30, 0.30, 0.10, 0.30],
+        ["https://images.cocodataset.org/val2017/000000000139.jpg", "a person, a tv, a remote", 0.35, 0.30, 0.12, 0.30],
     ]
 
     gr.Examples(
         examples=example_data,
+        inputs=[image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet],
         label="Click an example to populate the inputs",
     )
+
+    inputs = [image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet]
     outputs = [
+        output_image_llm, output_time_llm,
+        output_image_mm, output_time_mm,
+        output_image_owlv2, output_time_owlv2,
+        output_image_omdet, output_time_omdet,
     ]
+    generate_btn.click(fn=run_detection, inputs=inputs, outputs=outputs)
+    image.upload(fn=run_detection, inputs=inputs, outputs=outputs)
 
+# Optional: queue requests so multiple users are handled gracefully (tune as needed).
+# Gradio 4+ replaced queue()'s concurrency_count with default_concurrency_limit.
+app.queue(max_size=16, default_concurrency_limit=1).launch()
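
For readers who want to try one of these checkpoints outside the Space, below is a minimal, standalone sketch of the zero-shot detection flow that `detect()` wraps. It assumes a recent `transformers` release (one that ships LLMDet support) and uses `requests` to fetch the example image; neither the helper names nor the snippet itself are part of the commit.

```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

model_id = "iSEE-Laboratory/llmdet_tiny"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).eval()

# Same example image and prompts as the Space's first example row
url = "https://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompts = ["a cat", "a remote control"]

# The processor expects a list of prompt lists (one list per image)
inputs = processor(images=image, text=[prompts], return_tensors="pt")
with torch.inference_mode():
    outputs = model(**inputs)

# Convert raw outputs to pixel-space boxes; 0.3 mirrors the app's default threshold
results = processor.post_process_grounded_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]

for box, score, label in zip(results["boxes"], results["scores"], results["text_labels"]):
    print(label, f"{float(score):.2f}", [int(v) for v in box.tolist()])
```

The OWLv2 and OMDet Turbo checkpoints follow the same pattern, except that, as the app's `use_label_ids` flag reflects, their post-processed `labels` are prompt indices rather than strings.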