Spaces:

atalaydenknalbant
/

DINOv3

Running on Zero

App Files Files Community

atalaydenknalbant committed on Aug 15

Commit

7fe18c2

verified ·

1 Parent(s): 5e9def7

Update app.py

Browse files

Files changed (1) hide show

app.py +233 -70

app.py CHANGED Viewed

@@ -2,17 +2,17 @@ import os
 import io
 import time
 import json
 import tempfile
 from uuid import uuid4
-from typing import List, Tuple
 import gradio as gr
 from PIL import Image
 import numpy as np
 import torch
 from transformers import AutoImageProcessor, AutoModel
-import spaces
 # ---------------------------
 # Models and config
@@ -27,7 +27,6 @@ DEFAULT_MODEL = "ViT-B/16 LVD-1689M"
 HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets after requesting gated access
 # ---------------------------
 # ZeroGPU booking helpers
 # ---------------------------
@@ -40,6 +39,9 @@ def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     n = max(1, len(files) if files else 1)
     return min(600, 35 * n + 30)
 # ---------------------------
 # Model loading and core logic
@@ -47,9 +49,7 @@ def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
 def _load(model_id: str):
     # Use token for gated checkpoints
     processor = AutoImageProcessor.from_pretrained(
-        model_id,
-        use_fast=True,
-        token=HF_TOKEN if HF_TOKEN else None,
     )
     model = AutoModel.from_pretrained(
         model_id,
@@ -60,6 +60,11 @@ def _load(model_id: str):
     model.to("cuda").eval()
     return processor, model
 def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
     """
@@ -68,50 +73,43 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay:
     t0 = time.time()
     processor, model = _load(model_id)
-    # Keep BatchFeature when possible, but handle dict too
     bf = processor(images=image, return_tensors="pt")
-    if hasattr(bf, "to"):
-        bf = bf.to("cuda")
-        pixel_values = bf["pixel_values"]
-    else:
-        bf = {k: v.to("cuda") for k, v in bf.items()}
-        pixel_values = bf["pixel_values"]
     with torch.amp.autocast("cuda", dtype=torch.float16), torch.inference_mode():
         out = model(**bf)
-    # Embedding pooling
-    if pooling == "CLS":
-        if getattr(out, "pooler_output", None) is not None:
-            emb = out.pooler_output[0]
         else:
-            emb = out.last_hidden_state[0, 0]
-    else:
-        # mean of patch tokens or mean over H,W for conv features
-        if out.last_hidden_state.ndim == 3:
-            num_regs = getattr(model.config, "num_register_tokens", 0)
-            patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
-            emb = patch_tokens.mean(dim=0)
-        else:
-            feat = out.last_hidden_state[0]  # [C,H,W]
-            emb = feat.mean(dim=(1, 2))
     emb = emb.float().cpu().numpy()
     # Optional simple heat overlay for ViT
     overlay = None
-    if want_overlay and out.last_hidden_state.ndim == 3:
         num_regs = getattr(model.config, "num_register_tokens", 0)
-        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]   # [N_patches, D]
         num_patches = patch_tokens.shape[0]
-        # Prefer square grid from token count, else fall back to pixel/patch size
         h = int(num_patches ** 0.5)
         w = h
         if h * w != num_patches:
             patch = getattr(model.config, "patch_size", 16)
             h = int(pixel_values.shape[-2] // patch)
             w = int(pixel_values.shape[-1] // patch)
         mags = patch_tokens.norm(dim=1).reshape(h, w)
         mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
         m = (mags.cpu().numpy() * 255).astype(np.uint8)
@@ -126,32 +124,8 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay:
     }
     return emb, overlay, meta
 # ---------------------------
-# Single image API (ZeroGPU)
-# ---------------------------
-@spaces.GPU(duration=_gpu_duration_single)
-def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
-    if image is None:
-        return None, "[]", {"error": "No image"}, None
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
-    model_id = MODELS[model_name]
-    emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
-    # Preview + file save for gr.File
-    head = ", ".join(f"{x:.4f}" for x in emb[:16])
-    preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
-    out_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
-    np.save(out_path, emb.astype(np.float32), allow_pickle=False)
-    # Return: gr.Image, gr.Textbox, gr.JSON, gr.File
-    return overlay if overlay else image, preview, meta, out_path
-# ---------------------------
-# Multi image similarity (ZeroGPU)
 # ---------------------------
 def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
     imgs: List[Image.Image] = []
@@ -166,7 +140,9 @@ def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     # simple accessible HTML table render
     names_safe = [os.path.basename(n) for n in names]
-    header = "<tr><th></th>" + "".join(f"<th style='padding:6px 8px;text-align:center'>{n}</th>" for n in names_safe) + "</tr>"
     rows = []
     for i, r in enumerate(S):
         cells = "".join(f"<td style='padding:6px 8px;text-align:center'>{v:.3f}</td>" for v in r)
@@ -181,7 +157,32 @@ def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     """
     return table
 @spaces.GPU(duration=_gpu_duration_gallery)
 def batch_similarity(files: List[str], model_name: str, pooling: str):
     paths = files or []
@@ -189,33 +190,136 @@ def batch_similarity(files: List[str], model_name: str, pooling: str):
         return "<em>Upload at least 2 images</em>", None
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA not available. Ensure ZeroGPU is selected.")
     model_id = MODELS[model_name]
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
         e, _, _ = _extract_core(img, model_id, pooling, want_overlay=False)
         embs.append(e)
     if len(embs) < 2:
         return "<em>Failed to read or embed images</em>", None
     X = np.vstack(embs).astype(np.float32)
-    Xn = X / np.clip(np.linalg.norm(X, axis=1, keepdims=True), 1e-8, None)
     S = Xn @ Xn.T
     # save CSV and build HTML table
     csv_path = os.path.join(tempfile.gettempdir(), f"cosine_{uuid4().hex}.csv")
     np.savetxt(csv_path, S, delimiter=",", fmt="%.6f")
     html = _to_html_table(S, paths)
     return html, csv_path
 # ---------------------------
 # UI
 # ---------------------------
 with gr.Blocks() as app:
-    gr.Markdown("# DINOv3  Embeddings  Similarity  Classification")
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
@@ -234,7 +338,8 @@ with gr.Blocks() as app:
         url={https://arxiv.org/abs/2508.10104},
         }
         ```  """)
     with gr.Tab("Single"):
         with gr.Row():
             with gr.Column():
@@ -248,19 +353,31 @@ with gr.Blocks() as app:
                 preview = gr.Textbox(label="Embedding head", max_lines=2)
                 meta = gr.JSON(label="Meta")
                 download = gr.File(label="embedding.npy")
         run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
     with gr.Tab("Cosine Sim"):
         gr.Markdown("Upload multiple images. We compute a cosine similarity matrix on GPU and return a CSV.")
-        # Input as Files so you can multi-upload, plus a Gallery preview
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
         pooling2 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
         go = gr.Button("Compute cosine on GPU")
-        table = gr.HTML(label="Cosine similarity")  # display matrix here
         csv = gr.File(label="cosine_similarity_matrix.csv")
         def _preview(paths):
             if not paths:
                 return []
@@ -271,11 +388,57 @@ with gr.Blocks() as app:
                 except Exception:
                     pass
             return imgs
         files_in.change(_preview, inputs=files_in, outputs=gallery_preview)
         go.click(batch_similarity, [files_in, model_dd2, pooling2], [table, csv])
 if __name__ == "__main__":
     app.queue().launch()

 import io
 import time
 import json
+import glob
 import tempfile
 from uuid import uuid4
+from typing import List, Tuple, Dict
 import gradio as gr
 from PIL import Image
 import numpy as np
 import torch
 from transformers import AutoImageProcessor, AutoModel
+import spaces
 # ---------------------------
 # Models and config
 HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets after requesting gated access
 # ---------------------------
 # ZeroGPU booking helpers
 # ---------------------------
     n = max(1, len(files) if files else 1)
     return min(600, 35 * n + 30)
+def _gpu_duration_classify(*_args, **_kwargs) -> int:
+    """Return the fixed GPU booking duration used for a single prediction.
+
+    Every positional and keyword argument is accepted and ignored, so the
+    function can be passed as ``duration=`` to ``spaces.GPU`` whatever the
+    decorated function's signature is.
+    """
+    # Small buffer for one query plus a handful of centroids.
+    # NOTE(review): presumably seconds, per the ZeroGPU docs -- confirm.
+    booking = 90
+    return booking
 # ---------------------------
 # Model loading and core logic
 def _load(model_id: str):
     # Use token for gated checkpoints
     processor = AutoImageProcessor.from_pretrained(
+        model_id, use_fast=True, token=HF_TOKEN if HF_TOKEN else None,
     )
     model = AutoModel.from_pretrained(
         model_id,
     model.to("cuda").eval()
     return processor, model
+def _to_cuda_batchfeature(bf):
+    """Move image-processor output to the ``"cuda"`` device.
+
+    An object exposing a ``to`` attribute is moved in one call through
+    ``bf.to("cuda")``. Anything else is treated as a mapping: each value is
+    moved individually and the results are collected in a new plain ``dict``.
+    """
+    if not hasattr(bf, "to"):
+        # Plain-dict fallback: relocate every entry on its own.
+        moved = {}
+        for key, tensor in bf.items():
+            moved[key] = tensor.to("cuda")
+        return moved
+    # Keep the BatchFeature-style wrapper intact when it can move itself.
+    return bf.to("cuda")
 def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
     """
     t0 = time.time()
     processor, model = _load(model_id)
     bf = processor(images=image, return_tensors="pt")
+    bf = _to_cuda_batchfeature(bf)
+    pixel_values = bf["pixel_values"]
     with torch.amp.autocast("cuda", dtype=torch.float16), torch.inference_mode():
         out = model(**bf)
+        # Embedding pooling
+        if pooling == "CLS":
+            if getattr(out, "pooler_output", None) is not None:
+                emb = out.pooler_output[0]
+            else:
+                emb = out.last_hidden_state[0, 0]
         else:
+            # mean of patch tokens or mean over H W for conv features
+            if out.last_hidden_state.ndim == 3:
+                num_regs = getattr(model.config, "num_register_tokens", 0)
+                patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
+                emb = patch_tokens.mean(dim=0)
+            else:
+                feat = out.last_hidden_state[0]  # [C,H,W]
+                emb = feat.mean(dim=(1, 2))
     emb = emb.float().cpu().numpy()
     # Optional simple heat overlay for ViT
     overlay = None
+    if want_overlay and getattr(out, "last_hidden_state", None) is not None and out.last_hidden_state.ndim == 3:
         num_regs = getattr(model.config, "num_register_tokens", 0)
+        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]  # [N_patches, D]
         num_patches = patch_tokens.shape[0]
         h = int(num_patches ** 0.5)
         w = h
         if h * w != num_patches:
             patch = getattr(model.config, "patch_size", 16)
             h = int(pixel_values.shape[-2] // patch)
             w = int(pixel_values.shape[-1] // patch)
         mags = patch_tokens.norm(dim=1).reshape(h, w)
         mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
         m = (mags.cpu().numpy() * 255).astype(np.uint8)
     }
     return emb, overlay, meta
 # ---------------------------
+# Utilities
 # ---------------------------
 def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
     imgs: List[Image.Image] = []
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     # simple accessible HTML table render
     names_safe = [os.path.basename(n) for n in names]
+    header = "<tr><th></th>" + "".join(
+        f"<th style='padding:6px 8px;text-align:center'>{n}</th>" for n in names_safe
+    ) + "</tr>"
     rows = []
     for i, r in enumerate(S):
         cells = "".join(f"<td style='padding:6px 8px;text-align:center'>{v:.3f}</td>" for v in r)
     """
     return table
+def _normalize_rows(X: np.ndarray) -> np.ndarray:
+    """Scale each row of ``X`` to unit L2 norm.
+
+    Row norms are floored at 1e-8 before the division, so an all-zero row
+    stays all-zero instead of triggering a division by zero.
+    """
+    row_norms = np.linalg.norm(X, axis=1, keepdims=True)
+    safe_norms = np.clip(row_norms, 1e-8, None)
+    return X / safe_norms
+# ---------------------------
+# Single image API  ZeroGPU
+# ---------------------------
+@spaces.GPU(duration=_gpu_duration_single)
+def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
+    """Embed one image and package the result for the "Single" tab.
+
+    Args:
+        image: Query image, or ``None`` when nothing was uploaded.
+        model_name: Key into ``MODELS`` selecting the backbone.
+        pooling: Pooling mode, forwarded unchanged to ``_extract_core``.
+        want_overlay: Whether ``_extract_core`` should also produce an overlay.
+
+    Returns:
+        A 4-tuple ``(display_image, preview_text, meta, npy_path)`` matching the
+        gr.Image, gr.Textbox, gr.JSON and gr.File outputs. With no input image
+        the tuple is ``(None, "[]", {"error": "No image"}, None)``.
+
+    Raises:
+        RuntimeError: If CUDA is not available.
+    """
+    if image is None:
+        return None, "[]", {"error": "No image"}, None
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
+    model_id = MODELS[model_name]
+    emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
+    # Preview shows only the first 16 components; the full vector goes to disk.
+    head = ", ".join(f"{x:.4f}" for x in emb[:16])
+    preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
+    # Unique file name so concurrent requests never overwrite each other.
+    out_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
+    np.save(out_path, emb.astype(np.float32), allow_pickle=False)
+    # Explicit None check: "no overlay" is signalled by None, and relying on
+    # object truthiness would raise if the overlay were ever an array.
+    shown = overlay if overlay is not None else image
+    # Return: gr.Image, gr.Textbox, gr.JSON, gr.File
+    return shown, preview, meta, out_path
+# ---------------------------
+# Multi image similarity  ZeroGPU
+# ---------------------------
 @spaces.GPU(duration=_gpu_duration_gallery)
 def batch_similarity(files: List[str], model_name: str, pooling: str):
     paths = files or []
         return "<em>Upload at least 2 images</em>", None
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA not available. Ensure ZeroGPU is selected.")
     model_id = MODELS[model_name]
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
         e, _, _ = _extract_core(img, model_id, pooling, want_overlay=False)
         embs.append(e)
     if len(embs) < 2:
         return "<em>Failed to read or embed images</em>", None
     X = np.vstack(embs).astype(np.float32)
+    Xn = _normalize_rows(X)
     S = Xn @ Xn.T
     # save CSV and build HTML table
     csv_path = os.path.join(tempfile.gettempdir(), f"cosine_{uuid4().hex}.csv")
     np.savetxt(csv_path, S, delimiter=",", fmt="%.6f")
     html = _to_html_table(S, paths)
     return html, csv_path
+# ---------------------------
+# Image Classification using DINOv3 embeddings
+# Few shot nearest centroid on GPU
+# ---------------------------
+# State format:
+# state = {
+#   "model_id": str,
+#   "pooling": str,
+#   "classes": { "cat": {"embs": np.ndarray[Nc, D], "count": int}, ... }
+# }
+def _init_state() -> Dict:
+    """Create an empty classifier state: no model, no pooling, no classes.
+
+    A new dict (with a new ``"classes"`` dict) is built on every call.
+    """
+    return dict(model_id="", pooling="", classes={})
+def _summarize_state(state: Dict) -> Dict:
+    """Build a small summary of the classifier state without the embeddings.
+
+    Missing keys fall back to empty values, and a class bundle without a
+    ``"count"`` entry is reported as 0 examples.
+    """
+    classes = state.get("classes", {})
+    per_class = {label: bundle.get("count", 0) for label, bundle in classes.items()}
+    summary = {}
+    summary["model_id"] = state.get("model_id", "")
+    summary["pooling"] = state.get("pooling", "")
+    summary["class_counts"] = per_class
+    summary["num_classes"] = len(classes)
+    summary["total_examples"] = int(sum(per_class.values()))
+    return summary
+def _gpu_duration_add_class(class_name=None, files=None, *_args, **_kwargs) -> int:
+    # Booking helper for add_class: the duration callable gets the decorated
+    # function's own arguments, and add_class takes the class name first, so
+    # _gpu_duration_gallery alone would size the booking by the name's length.
+    return _gpu_duration_gallery(files)
+@spaces.GPU(duration=_gpu_duration_add_class)
+def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
+    """Embed the uploaded images and add them to one class of the classifier state.
+
+    Args:
+        class_name: Label for the class; surrounding whitespace is ignored.
+        files: Paths of the images uploaded for this class.
+        model_name: Key into ``MODELS`` selecting the backbone.
+        pooling: Pooling mode, forwarded unchanged to ``_extract_core``.
+        state: Classifier state as produced by ``_init_state``; an empty or
+            missing state is replaced by a fresh one.
+
+    Returns:
+        ``(summary, state)``. On success ``summary`` comes from
+        ``_summarize_state``; on a validation failure it is ``{"error": ...}``
+        and the state is returned without any class changes.
+
+    Raises:
+        RuntimeError: If CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available. Ensure ZeroGPU is selected.")
+    if not state:
+        state = _init_state()
+    # Store the stripped label so "cat" and "cat " end up in the same class.
+    label = (class_name or "").strip()
+    if not label:
+        return {"error": "Class name is empty"}, state
+    if not files:
+        return {"error": "No images uploaded for this class"}, state
+    model_id = MODELS[model_name]
+    # Open the images before touching the state, so an unreadable upload
+    # cannot wipe classes that were already built.
+    imgs = _open_images_from_paths(files)
+    if not imgs:
+        return {"error": "Could not read uploaded images"}, state
+    # Reset state if model settings changed: embeddings from different
+    # backbones or pooling modes are not comparable.
+    if state.get("model_id") and (state["model_id"] != model_id or state.get("pooling") != pooling):
+        state = _init_state()
+    state["model_id"] = model_id
+    state["pooling"] = pooling
+    X = np.vstack([
+        _extract_core(im, model_id, pooling, want_overlay=False)[0].astype(np.float32)
+        for im in imgs
+    ])
+    classes = state.setdefault("classes", {})
+    bundle = classes.get(label)
+    if bundle is None:
+        classes[label] = {"embs": X, "count": X.shape[0]}
+    else:
+        # Append to the examples already stored for this class.
+        merged = np.concatenate([bundle["embs"], X], axis=0)
+        bundle["embs"] = merged
+        bundle["count"] = merged.shape[0]
+    return _summarize_state(state), state
+@spaces.GPU(duration=_gpu_duration_classify)
+def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
+    """Classify a query image by cosine similarity to per-class centroids.
+
+    Args:
+        image: Query image, or ``None`` when nothing was uploaded.
+        model_name: Key into ``MODELS`` selecting the backbone.
+        pooling: Pooling mode, forwarded unchanged to ``_extract_core``.
+        state: Classifier state built by ``add_class``.
+        top_k: Number of classes to report; clamped to ``[1, number of classes]``.
+
+    Returns:
+        ``(info, scores, html)``: ``info`` holds the requested ``top_k`` and the
+        best class, ``scores`` maps the reported class names to softmax
+        probabilities, and ``html`` is an ordered list of the same entries.
+        On a validation failure the result is ``({"error": ...}, {}, None)``.
+
+    Raises:
+        RuntimeError: If CUDA is not available.
+    """
+    # Local import: class names are user input and must be escaped before
+    # they are placed in the HTML output.
+    from html import escape
+    if image is None:
+        return {"error": "Upload a query image"}, {}, None
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available. Ensure ZeroGPU is selected.")
+    if not state or not state.get("classes"):
+        return {"error": "No classes have been added yet"}, {}, None
+    model_id = MODELS[model_name]
+    if state.get("model_id") != model_id or state.get("pooling") != pooling:
+        return {"error": "Model or pooling changed after building classes. Clear and rebuild."}, {}, None
+    # Compute query embedding
+    q, _, _ = _extract_core(image, model_id, pooling, want_overlay=False)
+    q = q.astype(np.float32)[None, :]
+    qn = _normalize_rows(q)  # [1, D]
+    # Build centroids per class
+    names = []
+    cents = []
+    for cname, bundle in state["classes"].items():
+        X = bundle["embs"].astype(np.float32)
+        Xn = _normalize_rows(X)
+        c = Xn.mean(axis=0, keepdims=True)  # centroid in cosine space
+        c = _normalize_rows(c)
+        names.append(cname)
+        cents.append(c)
+    C = np.vstack(cents)  # [K, D]
+    sims = (qn @ C.T).flatten()  # cosine similarity
+    # Stable softmax over a temperature
+    temp = 0.05
+    logits = sims / max(temp, 1e-6)
+    logits = logits - logits.max()
+    probs = np.exp(logits)
+    probs = probs / probs.sum()
+    # The value may arrive as a float; a slice bound has to be an int.
+    requested = int(top_k)
+    order = np.argsort(-probs)[: max(1, min(requested, len(names)))]
+    result_dict = {names[i]: float(probs[i]) for i in order}
+    # Ranked list of the reported classes for display
+    full_table = "<ol>" + "".join(
+        f"<li>{escape(str(names[i]))}  score {float(probs[i]):.4f}</li>" for i in order
+    ) + "</ol>"
+    # gr.Label expects a dict of class to score for visualization
+    return {"top_k": requested, "prediction": names[order[0]]}, result_dict, full_table
+def clear_classes(_state: Dict):
+    """Discard the current classifier state.
+
+    The incoming state is ignored. Returns ``(fresh_state, summary)``, where
+    the summary describes a second, equally empty state.
+    """
+    fresh_state = _init_state()
+    empty_summary = _summarize_state(_init_state())
+    return fresh_state, empty_summary
 # ---------------------------
 # UI
 # ---------------------------
 with gr.Blocks() as app:
+    gr.Markdown("# DINOv3 - Embeddings, Similarity, Classification")
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
         url={https://arxiv.org/abs/2508.10104},
         }
         ```  """)
+    # ------------- Single -------------
     with gr.Tab("Single"):
         with gr.Row():
             with gr.Column():
                 preview = gr.Textbox(label="Embedding head", max_lines=2)
                 meta = gr.JSON(label="Meta")
                 download = gr.File(label="embedding.npy")
         run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
+        # Optional examples if you add files under ./examples
+        ex_single = []
+        for p in sorted(glob.glob("examples/*.*"))[:6]:
+            ex_single.append([p, DEFAULT_MODEL, "CLS", False])
+        if ex_single:
+            gr.Examples(
+                label="Examples",
+                examples=ex_single,
+                inputs=[img, model_dd, pooling, overlay],
+            )
+    # ------------- Cosine Sim -------------
     with gr.Tab("Cosine Sim"):
         gr.Markdown("Upload multiple images. We compute a cosine similarity matrix on GPU and return a CSV.")
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
         pooling2 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
         go = gr.Button("Compute cosine on GPU")
+        table = gr.HTML(label="Cosine similarity")
         csv = gr.File(label="cosine_similarity_matrix.csv")
         def _preview(paths):
             if not paths:
                 return []
                 except Exception:
                     pass
             return imgs
         files_in.change(_preview, inputs=files_in, outputs=gallery_preview)
         go.click(batch_similarity, [files_in, model_dd2, pooling2], [table, csv])
+    # ------------- Image Classification -------------
+    with gr.Tab("Image Classification"):
+        st = gr.State(_init_state())
+        with gr.Row():
+            with gr.Column():
+                model_dd3 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
+                pooling3 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
+                gr.Markdown("Build your labeled set by adding a few images per class.")
+                class_name = gr.Textbox(label="Class name")
+                class_files = gr.Files(label="Upload images for this class", file_types=["image"], type="filepath", file_count="multiple")
+                add_btn = gr.Button("Add class on GPU")
+                clear_btn = gr.Button("Clear classes")
+                state_view = gr.JSON(label="Classifier state")
+            with gr.Column():
+                query_img = gr.Image(type="pil", label="Query image", height=360)
+                topk = gr.Slider(1, 10, value=3, step=1, label="Top K")
+                predict_btn = gr.Button("Predict on GPU")
+                predicted = gr.Label(num_top_classes=3, label="Prediction")
+                scores_html = gr.HTML(label="Scores")
+        add_btn.click(
+            add_class,
+            [class_name, class_files, model_dd3, pooling3, st],
+            [state_view, st],
+        )
+        clear_btn.click(
+            clear_classes,
+            [st],
+            [st, state_view],
+        )
+        predict_btn.click(
+            predict_class,
+            [query_img, model_dd3, pooling3, st, topk],
+            [gr.JSON(label="Info"), predicted, scores_html],
+        )
+        # Optional sample query examples from ./examples
+        ex_cls = []
+        for p in sorted(glob.glob("examples/classify_*.*"))[:8]:
+            ex_cls.append([p, topk.value if hasattr(topk, "value") else 3])
+        if ex_cls:
+            gr.Examples(
+                label="Query examples",
+                examples=ex_cls,
+                inputs=[query_img, topk],
+            )
 if __name__ == "__main__":
     app.queue().launch()