Spaces:

scyonggg
/

Video_action_recognition_on_MERL_shopping_dataset

Sleeping

App Files Files Community

scyonggg committed on Jul 8

Commit

7d8112d

1 Parent(s): 4fe3173

Initial commit

Browse files

Files changed (2) hide show

.gitattributes +1 -0
app.py +178 -122

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpts/best_val.pt filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import tempfile
 import threading
 import time
 from huggingface_hub import hf_hub_download
 from evals.video_classification_frozen.models import init_module
 from src.models.attentive_pooler import AttentiveClassifier
@@ -23,7 +24,135 @@ ACTION_LABELS = {
     4: "Inspect Shelf",
 }
-def process_video(video_path, encoder, classifier, device, frames_per_clip=16, resolution=256):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     if fps == 0 or np.isnan(fps):
@@ -34,102 +163,36 @@ def process_video(video_path, encoder, classifier, device, frames_per_clip=16, r
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         out_path = tmpfile.name
     out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
-    frames = []
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame_resized = cv2.resize(frame, (resolution, resolution))
-        frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
-        frame_tensor = torch.from_numpy(frame_rgb).permute(2, 0, 1).float() / 255.0
-        frames.append(frame_tensor)
-        if len(frames) == frames_per_clip:
-            clip = torch.stack(frames)
-            clip = clip.permute(1, 0, 2, 3).unsqueeze(0).to(device)
-            with torch.no_grad():
-                features = encoder([[clip]])[0]
-                logits = classifier(features)
-                pred = logits.argmax(dim=1).item()
-                label = ACTION_LABELS.get(pred, str(pred))
-            for f in frames:
-                f_disp = (f.permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
-                f_disp = cv2.cvtColor(f_disp, cv2.COLOR_RGB2BGR)
-                f_disp = cv2.resize(f_disp, (width, height))
-                overlay = f_disp.copy()
-                text = label
-                font = cv2.FONT_HERSHEY_SIMPLEX
-                font_scale = 1.2
-                thickness = 2
-                text_color = (255, 255, 255)  # white
-                outline_color = (0, 0, 0)    # black
-                alpha = 0.6  # transparency for rectangle
-                (text_w, text_h), baseline = cv2.getTextSize(text, font, font_scale, thickness)
-                rect_w, rect_h = text_w + 40, text_h + 30
-                center_x = width // 2
-                rect_x1 = center_x - rect_w // 2
-                rect_y1 = height - rect_h - 30
-                rect_x2 = rect_x1 + rect_w
-                rect_y2 = rect_y1 + rect_h
-                radius = 15
-                cv2.rectangle(overlay, (rect_x1 + radius, rect_y1), (rect_x2 - radius, rect_y2), (0,0,0), -1)
-                cv2.rectangle(overlay, (rect_x1, rect_y1 + radius), (rect_x2, rect_y2 - radius), (0,0,0), -1)
-                cv2.circle(overlay, (rect_x1 + radius, rect_y1 + radius), radius, (0,0,0), -1)
-                cv2.circle(overlay, (rect_x2 - radius, rect_y1 + radius), radius, (0,0,0), -1)
-                cv2.circle(overlay, (rect_x1 + radius, rect_y2 - radius), radius, (0,0,0), -1)
-                cv2.circle(overlay, (rect_x2 - radius, rect_y2 - radius), radius, (0,0,0), -1)
-                cv2.addWeighted(overlay, alpha, f_disp, 1 - alpha, 0, f_disp)
-                text_x = center_x - text_w // 2
-                text_y = rect_y1 + rect_h // 2 + text_h // 2 - 5
-                cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, outline_color, thickness + 2, cv2.LINE_AA)
-                cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)
-                out.write(f_disp)
-            frames = []
-    for f in frames:
-        f_disp = (f.permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
-        f_disp = cv2.cvtColor(f_disp, cv2.COLOR_RGB2BGR)
-        f_disp = cv2.resize(f_disp, (width, height))
-        overlay = f_disp.copy()
-        text = "-"
-        font = cv2.FONT_HERSHEY_SIMPLEX
-        font_scale = 1.2
-        thickness = 2
-        text_color = (255, 255, 255)
-        outline_color = (0, 0, 0)
-        alpha = 0.6
-        (text_w, text_h), baseline = cv2.getTextSize(text, font, font_scale, thickness)
-        rect_w, rect_h = text_w + 40, text_h + 30
-        center_x = width // 2
-        rect_x1 = center_x - rect_w // 2
-        rect_y1 = height - rect_h - 30
-        rect_x2 = rect_x1 + rect_w
-        rect_y2 = rect_y1 + rect_h
-        radius = 15
-        cv2.rectangle(overlay, (rect_x1 + radius, rect_y1), (rect_x2 - radius, rect_y2), (0,0,0), -1)
-        cv2.rectangle(overlay, (rect_x1, rect_y1 + radius), (rect_x2, rect_y2 - radius), (0,0,0), -1)
-        cv2.circle(overlay, (rect_x1 + radius, rect_y1 + radius), radius, (0,0,0), -1)
-        cv2.circle(overlay, (rect_x2 - radius, rect_y1 + radius), radius, (0,0,0), -1)
-        cv2.circle(overlay, (rect_x1 + radius, rect_y2 - radius), radius, (0,0,0), -1)
-        cv2.circle(overlay, (rect_x2 - radius, rect_y2 - radius), radius, (0,0,0), -1)
-        cv2.addWeighted(overlay, alpha, f_disp, 1 - alpha, 0, f_disp)
-        text_x = center_x - text_w // 2
-        text_y = rect_y1 + rect_h // 2 + text_h // 2 - 5
-        cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, outline_color, thickness + 2, cv2.LINE_AA)
-        cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)
-        out.write(f_disp)
-    cap.release()
-    out.release()
-    return out_path
-def gradio_infer(video, encoder, classifier, device):
-    processed_path = process_video(video, encoder, classifier, device)
-    def cleanup(path):
-        time.sleep(60)
-        try:
-            Path(path).unlink()
-        except Exception:
-            pass
-    threading.Thread(target=cleanup, args=(processed_path,), daemon=True).start()
-    return processed_path
 def load_config(config_path):
     with open(config_path, 'r') as f:
@@ -138,39 +201,32 @@ def load_config(config_path):
 def load_models():
     # Paths are relative to HuggingFace directory
     config_path = "configs/merl.yaml"
-    encoder_ckpt = hf_hub_download(
-        repo_id="facebook/vjepa2-vitl-fpc64-256",
-        filename="model.safetensors",
-        repo_type="model"
-    )
     classifier_ckpt = "ckpts/best_val.pt"
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     config = load_config(config_path)
-    model_kwargs = config['model_kwargs']['pretrain_kwargs']
-    wrapper_kwargs = config['model_kwargs'].get('wrapper_kwargs', {})
-    module_name = config['model_kwargs']['module_name']
     frames_per_clip = config['experiment']['data']['frames_per_clip']
     resolution = config['experiment']['data']['resolution']
     num_heads = config['experiment']['classifier']['num_heads']
     depth = config['experiment']['classifier']['num_probe_blocks']
     num_classes = config['experiment']['data']['num_classes']
-    encoder = init_module(
-        module_name=module_name,
-        frames_per_clip=frames_per_clip,
-        resolution=resolution,
-        checkpoint=encoder_ckpt,
-        model_kwargs=model_kwargs,
-        wrapper_kwargs=wrapper_kwargs,
-        device=device,
-    )
     encoder.eval()
     encoder.to(device)
     classifier_ckpt_data = torch.load(classifier_ckpt, map_location='cpu')
     state_dict = classifier_ckpt_data['classifier']
     if any(k.startswith('module.') for k in state_dict.keys()):
         state_dict = {k.replace('module.', '', 1): v for k, v in state_dict.items()}
     classifier = AttentiveClassifier(
-        embed_dim=encoder.embed_dim,
         num_heads=num_heads,
         depth=depth,
         num_classes=num_classes
@@ -178,19 +234,19 @@ def load_models():
     classifier.load_state_dict(state_dict, strict=True)
     classifier.eval()
     classifier.to(device)
-    return encoder, classifier, device, frames_per_clip, resolution
-encoder, classifier, device, frames_per_clip, resolution = load_models()
-def gradio_wrapper(video):
-    return gradio_infer(video, encoder, classifier, device)
 demo = gr.Interface(
     fn=gradio_wrapper,
     inputs=gr.Video(label="Upload Video"),
     outputs=gr.Video(label="Processed Video with Action Labels"),
     title="V-JEPA2 Video Action Recognition Demo",
-    description="Upload a video or use your webcam. The model will recognize and localize actions in real-time.",
     allow_flagging="never",
     live=False,
 )

 import threading
 import time
 from huggingface_hub import hf_hub_download
+from transformers import AutoModel, AutoVideoProcessor
 from evals.video_classification_frozen.models import init_module
 from src.models.attentive_pooler import AttentiveClassifier
     4: "Inspect Shelf",
 }
+VIT_EMBED_DIMS = {
+    "vit_synthetic": 1,
+    "vit_tiny": 192,
+    "vit_small": 384,
+    "vit_base": 768,
+    "vit_large": 1024,
+    "vit_huge": 1280,
+    "vit_giant": 1408,
+    "vit_gigantic": 1664,
+}
+# Support MPS (Apple Silicon) in addition to CUDA/CPU
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
+    device = torch.device('mps')
+else:
+    device = torch.device('cpu')
+print(f'Using device {device}')
+def gradio_infer(video_path, encoder, classifier, hf_transform, frames_per_clip, resolution):
+    with torch.inference_mode():
+        cap = cv2.VideoCapture(video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        if fps == 0 or np.isnan(fps):
+            fps = 25
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+            out_path = tmpfile.name
+        out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
+        frames = []
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame_resized = cv2.resize(frame, (resolution, resolution))
+            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
+            frame_tensor = torch.from_numpy(frame_rgb).permute(2, 0, 1).float()  # C H W
+            frames.append(frame_tensor)
+            if len(frames) == frames_per_clip:
+                clip = torch.stack(frames)  # T C H W
+                clip = hf_transform(clip, return_tensors="pt")["pixel_values_videos"].to(device)
+                with torch.no_grad():
+                    features = encoder.get_vision_features(clip)
+                    logits = classifier(features)
+                    pred = logits.argmax(dim=1).item()
+                    label = ACTION_LABELS.get(pred, str(pred))
+                for f in frames:
+                    f_disp = (f.permute(1,2,0).cpu().numpy()).astype(np.uint8)
+                    f_disp = cv2.cvtColor(f_disp, cv2.COLOR_RGB2BGR)
+                    f_disp = cv2.resize(f_disp, (width, height))
+                    overlay = f_disp.copy()
+                    text = label
+                    font = cv2.FONT_HERSHEY_SIMPLEX
+                    font_scale = 1.2
+                    thickness = 2
+                    text_color = (255, 255, 255)  # white
+                    outline_color = (0, 0, 0)    # black
+                    alpha = 0.6  # transparency for rectangle
+                    (text_w, text_h), baseline = cv2.getTextSize(text, font, font_scale, thickness)
+                    rect_w, rect_h = text_w + 40, text_h + 30
+                    center_x = width // 2
+                    rect_x1 = center_x - rect_w // 2
+                    rect_y1 = height - rect_h - 30
+                    rect_x2 = rect_x1 + rect_w
+                    rect_y2 = rect_y1 + rect_h
+                    radius = 15
+                    cv2.rectangle(overlay, (rect_x1 + radius, rect_y1), (rect_x2 - radius, rect_y2), (0,0,0), -1)
+                    cv2.rectangle(overlay, (rect_x1, rect_y1 + radius), (rect_x2, rect_y2 - radius), (0,0,0), -1)
+                    cv2.circle(overlay, (rect_x1 + radius, rect_y1 + radius), radius, (0,0,0), -1)
+                    cv2.circle(overlay, (rect_x2 - radius, rect_y1 + radius), radius, (0,0,0), -1)
+                    cv2.circle(overlay, (rect_x1 + radius, rect_y2 - radius), radius, (0,0,0), -1)
+                    cv2.circle(overlay, (rect_x2 - radius, rect_y2 - radius), radius, (0,0,0), -1)
+                    cv2.addWeighted(overlay, alpha, f_disp, 1 - alpha, 0, f_disp)
+                    text_x = center_x - text_w // 2
+                    text_y = rect_y1 + rect_h // 2 + text_h // 2 - 5
+                    cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, outline_color, thickness + 2, cv2.LINE_AA)
+                    cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)
+                    out.write(f_disp)
+                frames = []
+        for f in frames:
+            f_disp = (f.permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
+            f_disp = cv2.cvtColor(f_disp, cv2.COLOR_RGB2BGR)
+            f_disp = cv2.resize(f_disp, (width, height))
+            overlay = f_disp.copy()
+            text = "-"
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            font_scale = 1.2
+            thickness = 2
+            text_color = (255, 255, 255)
+            outline_color = (0, 0, 0)
+            alpha = 0.6
+            (text_w, text_h), baseline = cv2.getTextSize(text, font, font_scale, thickness)
+            rect_w, rect_h = text_w + 40, text_h + 30
+            center_x = width // 2
+            rect_x1 = center_x - rect_w // 2
+            rect_y1 = height - rect_h - 30
+            rect_x2 = rect_x1 + rect_w
+            rect_y2 = rect_y1 + rect_h
+            radius = 15
+            cv2.rectangle(overlay, (rect_x1 + radius, rect_y1), (rect_x2 - radius, rect_y2), (0,0,0), -1)
+            cv2.rectangle(overlay, (rect_x1, rect_y1 + radius), (rect_x2, rect_y2 - radius), (0,0,0), -1)
+            cv2.circle(overlay, (rect_x1 + radius, rect_y1 + radius), radius, (0,0,0), -1)
+            cv2.circle(overlay, (rect_x2 - radius, rect_y1 + radius), radius, (0,0,0), -1)
+            cv2.circle(overlay, (rect_x1 + radius, rect_y2 - radius), radius, (0,0,0), -1)
+            cv2.circle(overlay, (rect_x2 - radius, rect_y2 - radius), radius, (0,0,0), -1)
+            cv2.addWeighted(overlay, alpha, f_disp, 1 - alpha, 0, f_disp)
+            text_x = center_x - text_w // 2
+            text_y = rect_y1 + rect_h // 2 + text_h // 2 - 5
+            cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, outline_color, thickness + 2, cv2.LINE_AA)
+            cv2.putText(f_disp, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)
+            out.write(f_disp)
+        cap.release()
+        out.release()
+        def cleanup(path):
+            time.sleep(60)
+            try:
+                Path(path).unlink()
+            except Exception:
+                pass
+        threading.Thread(target=cleanup, args=(out_path,), daemon=True).start()
+        return out_path
+def process_video(video_path):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     if fps == 0 or np.isnan(fps):
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         out_path = tmpfile.name
     out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
+    return
+# def forward_vjepa_video(encoder, classifier, hf_transform, video_path):
+#     with torch.inference_mode():
+#         cap = cv2.VideoCapture(video_path)
+#         fps = cap.get(cv2.CAP_PROP_FPS)
+#         if fps == 0 or np.isnan(fps):
+#             fps = 25
+#         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+#         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+#         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+#         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+#             out_path = tmpfile.name
+#         out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
+#         frames = []
+#         while True:
+#             ret, frame = cap.read()
+#             if not ret:
+#                 break
+#             frame = torch.from_numpy(frame).
+# def forward_vjepa_video(model, hf_transform):
+#     # Run a sample inference with VJEPA
+#     with torch.inference_mode():
+#         # Read and pre-process the image
+#         video = get_video()  # T x H x W x C
+#         video = torch.from_numpy(video).permute(0, 3, 1, 2)  # T x C x H x W
+#         x_hf = hf_transform(video, return_tensors="pt")["pixel_values_videos"].to(device)
+#         out_patch_features_hf = model.get_vision_features(x_hf)
+#     return out_patch_features_hf
 def load_config(config_path):
     with open(config_path, 'r') as f:
 def load_models():
     # Paths are relative to HuggingFace directory
     config_path = "configs/merl.yaml"
+    hf_model_name = "facebook/vjepa2-vitl-fpc64-256"
     classifier_ckpt = "ckpts/best_val.pt"
     config = load_config(config_path)
     frames_per_clip = config['experiment']['data']['frames_per_clip']
     resolution = config['experiment']['data']['resolution']
     num_heads = config['experiment']['classifier']['num_heads']
     depth = config['experiment']['classifier']['num_probe_blocks']
     num_classes = config['experiment']['data']['num_classes']
+    # Build HuggingFace preprocessing transform
+    hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)
+    img_size = hf_transform.crop_size["height"]
+    # Initialize the HuggingFace model, load pretrained weights
+    encoder = AutoModel.from_pretrained(hf_model_name)
     encoder.eval()
     encoder.to(device)
     classifier_ckpt_data = torch.load(classifier_ckpt, map_location='cpu')
     state_dict = classifier_ckpt_data['classifier']
     if any(k.startswith('module.') for k in state_dict.keys()):
         state_dict = {k.replace('module.', '', 1): v for k, v in state_dict.items()}
     classifier = AttentiveClassifier(
+        embed_dim=VIT_EMBED_DIMS['vit_large'],
         num_heads=num_heads,
         depth=depth,
         num_classes=num_classes
     classifier.load_state_dict(state_dict, strict=True)
     classifier.eval()
     classifier.to(device)
+    return encoder, classifier, hf_transform, frames_per_clip, resolution
+def gradio_wrapper(video_path):
+    encoder, classifier, hf_transform, frames_per_clip, resolution = load_models()
+    return gradio_infer(video_path, encoder, classifier, hf_transform, frames_per_clip, resolution)
 demo = gr.Interface(
     fn=gradio_wrapper,
     inputs=gr.Video(label="Upload Video"),
     outputs=gr.Video(label="Processed Video with Action Labels"),
     title="V-JEPA2 Video Action Recognition Demo",
+    description="Upload a video or use your webcam. The model will recognize and localize actions in real-time. \
+        Recognizable actions: Reach To Shelf, Retract From Shelf, Hand In Shelf, Inspect Product, Inspect Shelf",
     allow_flagging="never",
     live=False,
 )