OmniSapiens BAM — Sarcasm Detection
Fine-tuned Qwen2.5-Omni-7B for multimodal sarcasm detection on the MUStARD/MMSD benchmark. Uses LoRA adapters merged into the backbone and a lightweight classification head.
Benchmark
Evaluated on keentomato/human_behavior_atlas.
Usage
Installation
pip install transformers torch huggingface_hub
Classification
# Minimal setup: load the merged backbone + processor, then rebuild the
# per-domain linear classification heads from heads.bin.
import json, torch
from huggingface_hub import hf_hub_download
from transformers import Qwen2_5OmniThinkerForConditionalGeneration, AutoProcessor

MODEL_ID = "keentomato/omnisapiens_bam_sarcasm_detection"

# 1. Load backbone and processor
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# 2. Load classification heads and label scheme
# NOTE(review): torch.load deserialises a pickle and can execute arbitrary
# code — only do this against a trusted repo (or pass weights_only=True on
# torch >= 2.0).
heads_path = hf_hub_download(MODEL_ID, "heads.bin")
label_path = hf_hub_download(MODEL_ID, "label_scheme.json")
heads_sd = torch.load(heads_path, map_location="cpu")
with open(label_path) as f:
    label_scheme = json.load(f)

# 3. Reconstruct domain heads
# One Linear head per behavioral domain: hidden_size -> n_classes(domain).
global_classes = label_scheme["meta"]["global_classes"] # {domain: [{index, label}, ...]}
hidden_size = model.config.hidden_size
domain_names = list(global_classes.keys())
domain_heads = torch.nn.ModuleList([
    torch.nn.Linear(hidden_size, len(global_classes[d])) for d in domain_names
])
# Checkpoint keys look like "heads.<i>.weight"; strip only the leading prefix
# (replace(..., 1)) so an accidental inner occurrence is never touched.
domain_heads.load_state_dict({k.replace("heads.", "", 1): v for k, v in heads_sd.items()})
# BUGFIX: keep the heads in fp32. The forward passes below feed them
# pooled.float() (fp32), and an fp16 nn.Linear given fp32 input raises a
# dtype-mismatch error — so do NOT cast the heads to float16 here.
domain_heads.eval().to(model.device)
domain_to_id = {d: i for i, d in enumerate(domain_names)}
# 4. Prepare multimodal inputs
# video_tensor: [T, C, H, W] tensor or list of PIL images
# audio_waveform: 1-D numpy array / tensor at 16 kHz
domain = "sarcasm"
messages = [{"role": "user", "content": [
    {"type": "video"},
    {"type": "audio"},
    {"type": "text", "text": "Classify the human behavior expressed."},
]}]
text = processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
# NOTE(review): the keyword names ("videos", "audio") must match the installed
# processor's signature — confirm against your transformers version.
inputs = processor(text=[text], videos=[video_tensor], audio=[audio_waveform], return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# 5. Forward pass — pool penultimate hidden layer, route through domain head
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True, use_cache=False)
    h = out.hidden_states[-2] # [B, T, H]
    # Mask-weighted mean pooling over the sequence dimension (ignores padding).
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (h * mask).sum(1) / mask.sum(1) # [B, H]
    # NOTE(review): pooled is cast to fp32 here — make sure domain_heads are
    # kept in fp32 too, or this matmul raises a dtype-mismatch error.
    logits = domain_heads[domain_to_id[domain]](pooled.float()) # [B, K_d]
pred_idx = logits.argmax(dim=-1).item()
label_name = global_classes[domain][pred_idx]["label"]
print(f"Predicted {domain}: {label_name}")
Behavioral Descriptors (BAM Adapters)
Because `adapters.bin` is present in the repo, the model supports side-channel
behavioral descriptors extracted from OpenPose (video) and OpenSmile (audio).
These replace the raw video/audio inputs to the backbone with pre-computed
behavioral feature vectors that are injected via lightweight MLP adapters.
Video — OpenPose keypoints
OpenPose produces a dict per clip with keys `pose`, `face`, `left_hand`, `right_hand`,
each a [T, K, 2 or 3] tensor (T frames, K keypoints, x/y/confidence).
def prepare_video_feats(openpose_dict, temporal_mode="meanstd"):
    """Pool an OpenPose keypoint dict into a single feature vector.

    Parameters
    ----------
    openpose_dict : dict
        Maps body-part name ("pose", "face", "left_hand", "right_hand") to a
        [T, K, 2 or 3] array of per-frame keypoints (x, y[, confidence]).
        Missing parts are skipped.
    temporal_mode : str
        "meanstd" concatenates per-dimension mean and std over time
        ([D_v * 2]); any other value returns the temporal mean ([D_v]).

    Returns
    -------
    torch.Tensor
        1-D pooled feature vector.

    Raises
    ------
    ValueError
        If none of the expected keypoint groups are present.
    """
    parts = []
    for key in ("pose", "face", "left_hand", "right_hand"):
        t = openpose_dict.get(key)  # [T, K, 2 or 3]
        if t is None:
            continue
        # Drop the confidence channel (if any); keep only x/y coordinates.
        t = torch.as_tensor(t).float()[..., :2]
        parts.append(t.reshape(t.shape[0], -1))  # [T, K*2]
    if not parts:
        # torch.cat on an empty list would raise an opaque RuntimeError.
        raise ValueError(
            "openpose_dict contains none of: pose, face, left_hand, right_hand"
        )
    seq = torch.cat(parts, dim=-1)  # [T, D_v] (already float32)
    if temporal_mode == "meanstd":
        # torch.std uses the unbiased estimator, which is NaN for a single
        # frame; fall back to zeros so one-frame clips remain usable.
        std = seq.std(0) if seq.shape[0] > 1 else torch.zeros_like(seq[0])
        return torch.cat([seq.mean(0), std])  # [D_v*2]
    return seq.mean(0)  # [D_v]
# Pool one clip's OpenPose output and add a batch dimension.
video_feats = prepare_video_feats(openpose_dict).unsqueeze(0) # [1, D_v_pooled]
Audio — OpenSmile features
OpenSmile produces a dict with key `features` holding a [T, D_a] or [D_a] array.
def prepare_audio_feats(opensmile_dict):
    """Pool an OpenSmile feature dict into an L2-normalised vector [D_a].

    Accepts either a single feature vector ([D_a]) or a frame sequence
    ([T, D_a]); a sequence is averaged over time before normalisation.
    (The previous `squeeze(0)` only handled T == 1 and silently returned a
    2-D tensor otherwise; `mean(0)` is identical for T == 1 and correct
    for T > 1.)
    """
    x = torch.as_tensor(opensmile_dict["features"]).float()
    if x.ndim == 2:
        x = x.mean(0)  # temporal mean -> [D_a]
    # clamp_min avoids division by zero for an all-zero feature vector.
    return x / x.norm(p=2).clamp_min(1e-6)
# Normalise one clip's OpenSmile output and add a batch dimension.
audio_feats = prepare_audio_feats(opensmile_dict).unsqueeze(0) # [1, D_a]
Loading and applying the adapters
import torch, torch.nn as nn
from huggingface_hub import hf_hub_download
# Single flat state dict holding both the video and the audio adapter weights.
# NOTE(review): torch.load unpickles the file — only load from a trusted repo
# (or pass weights_only=True on torch >= 2.0).
adapters_sd = torch.load(hf_hub_download(MODEL_ID, "adapters.bin"), map_location="cpu")
# Infer architecture from saved weight shapes — no config needed
def _make_adapter(prefix, sd):
w0 = sd[f"{prefix}.mlp.0.weight"] # [hidden, feat_dim]
w2 = sd[f"{prefix}.mlp.2.weight"] # [out_dim, hidden]
feat_dim, hidden, out_dim = w0.shape[1], w0.shape[0], w2.shape[0]
mlp = nn.Sequential(nn.Linear(feat_dim, hidden), nn.ReLU(), nn.Linear(hidden, out_dim))
alpha = nn.Parameter(sd[f"{prefix}.alpha"])
class _Adapter(nn.Module):
def __init__(self): super().__init__(); self.mlp = mlp; self.alpha = alpha
def forward(self, x): return self.mlp(x) * self.alpha
m = _Adapter()
m.load_state_dict({k[len(prefix)+1:]: v for k, v in sd.items() if k.startswith(prefix)}, strict=False)
return m.eval()
# Instantiate both adapters on the model's device in half precision (matching
# the fp16 feature tensors fed to them below).
video_adapter = _make_adapter("video_adapter", adapters_sd).to(model.device).half()
audio_adapter = _make_adapter("audio_adapter", adapters_sd).to(model.device).half()
# Augment pooled repr with BAM deltas before the classification head
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True, use_cache=False)
    h = out.hidden_states[-2]
    # Mask-weighted mean pooling over the sequence dimension (ignores padding).
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (h * mask).sum(1) / mask.sum(1) # [B, H]
    # Adapter outputs are additive residuals on the pooled representation.
    pooled = pooled + video_adapter(video_feats.to(model.device).half())
    pooled = pooled + audio_adapter(audio_feats.to(model.device).half())
    # NOTE(review): pooled is cast to fp32 before the head — make sure the
    # domain heads are kept in fp32 too, or this matmul raises a dtype error.
    logits = domain_heads[domain_to_id[domain]](pooled.float())
pred_idx = logits.argmax(dim=-1).item()
label_name = global_classes[domain][pred_idx]["label"]
print(f"Predicted {domain}: {label_name}")
- Downloads last month
- 20