Model Card for Model ID
Model Details
Model Description
This model is based on Qwen2-VL-Instruct and is developed through the MP3L pipeline.
Training Details
Training Data
train_DPO.json, train_KTO.json
Quick Start
Below is a simple example that can be deployed locally and demonstrates how to use this model for metaphor detection:
import os
import gc
import torch
import io
import base64
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
# Default directory of the merged MP3L checkpoint (relative to this script).
MODEL_DIR_DEFAULT = "../MP3L/"

# Process-wide cache for the currently loaded model and its processor;
# both entries are None while nothing is loaded.
_loaded = {"model": None, "processor": None}
def unload_model():
    """Drop the cached model/processor and reclaim host + GPU memory.

    Returns a human-readable status string for the UI status area.
    """
    if _loaded["model"] is None:
        return "No model currently loaded."
    # pop() never raises, so no try/except is needed around the removal.
    stale_model = _loaded.pop("model", None)
    stale_processor = _loaded.pop("processor", None)
    del stale_model, stale_processor
    _loaded["model"] = None
    _loaded["processor"] = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    return "✅ Model unloaded and memory cleaned up."
def _parse_gpu_ids(gpu_ids_str: str):
"""
Input example: "0" / "0,1" / " 1 , 3 "
Returns: [0,1] / None (meaning no restriction)
"""
s = (gpu_ids_str or "").strip()
if not s:
return None
ids = []
for p in s.replace(" ", "").split(","):
if p == "":
continue
ids.append(int(p))
ids = sorted(set(ids))
return ids
def load_model(model_dir, device_pref, dtype_pref, use_flash_attn2, gpu_ids_str, gpu_max_memory, cpu_max_memory):
    """Load the Qwen2-VL checkpoint into the process-wide cache.

    Args:
        model_dir: path to the merged checkpoint directory.
        device_pref: "auto" / "cuda" / "cpu".
        dtype_pref: "auto" or a torch dtype name ("float16", "bfloat16", ...).
        use_flash_attn2: request the flash_attention_2 implementation.
        gpu_ids_str: e.g. "0,1" restricts loading to those GPUs (via max_memory).
        gpu_max_memory: per-GPU cap, e.g. "20GiB".
        cpu_max_memory: optional CPU offload cap, e.g. "64GiB".

    Returns:
        A human-readable status message for the UI.
    """
    unload_model()  # always start from a clean slate

    attn_impl = "flash_attention_2" if use_flash_attn2 else None
    dtype = "auto" if dtype_pref == "auto" else getattr(torch, dtype_pref)
    gpu_ids = _parse_gpu_ids(gpu_ids_str)

    # transformers shards the model across whatever devices appear in
    # max_memory, so restricting GPUs is done by listing only those ids.
    device_map = None
    max_memory = None
    cuda_usable = torch.cuda.is_available() and device_pref != "cpu"
    if cuda_usable:
        # Both "auto" and "cuda" preferences use device_map="auto" so
        # transformers may still offload to CPU when the GPUs fill up.
        device_map = "auto"
        if gpu_ids is not None:
            max_memory = {gid: gpu_max_memory for gid in gpu_ids}
            cpu_cap = (cpu_max_memory or "").strip()
            if cpu_cap:
                # CPU fallback entry avoids OOM when the listed GPUs are full.
                max_memory["cpu"] = cpu_cap

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype=dtype,
        device_map=device_map,
        max_memory=max_memory,
        attn_implementation=attn_impl,
    )
    model.eval()
    processor = AutoProcessor.from_pretrained(model_dir)

    _loaded["model"] = model
    _loaded["processor"] = processor

    msg = f"✅ Model loaded: {model_dir}\n- device_map={device_map}"
    if max_memory is not None:
        msg += f"\n- max_memory={max_memory}"
    if gpu_ids is not None:
        msg += f"\n- Restricted GPU IDs={gpu_ids}"
    return msg
def load_image_from_path(path: str):
    """Open an image file from disk as RGB.

    Returns a ``(image_or_None, status_message)`` tuple; the message is
    shown in the UI status area.
    """
    candidate = (path or "").strip()
    if not candidate:
        return None, "❌ Please enter an image path."
    if not os.path.isfile(candidate):
        return None, f"❌ File does not exist: {candidate}"
    try:
        loaded = Image.open(candidate).convert("RGB")
    except Exception as e:
        return None, f"❌ Failed to load image: {repr(e)}"
    return loaded, f"✅ Image loaded: {candidate}"
def _md_linebreak(s: str) -> str:
"""Make line breaks render properly in Markdown."""
s = (s or "").replace("\r\n", "\n")
return s.replace("\n", " \n") # two spaces + newline = Markdown forced line break
def _pil_to_data_uri(img: Image.Image, max_side: int = 512) -> str:
    """Encode a PIL image as a PNG data URI (``data:image/png;base64,...``).

    The image is downscaled so its longest side is at most ``max_side``,
    keeping the base64 payload small enough to embed in chat markdown.
    Returns "" when ``img`` is None.
    """
    if img is None:
        return ""
    im = img.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1:
        # Clamp each dimension to >= 1 px: for extreme aspect ratios the
        # original int(w / scale) could truncate to 0, and PIL rejects
        # zero-sized resizes.
        im = im.resize((max(1, int(w / scale)), max(1, int(h / scale))))
    buf = io.BytesIO()
    im.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{b64}"
def chat_generate(ui_msgs, qwen_msgs, user_text, user_image, max_new_tokens, temperature, top_p):
    """Run one chat turn against the loaded Qwen2-VL model.

    Args:
        ui_msgs: history for gr.Chatbot display (markdown strings, may embed
            base64 data-URI images).
        qwen_msgs: history for Qwen2-VL inference (content is a list of
            {"type": "text"/"image", ...} dicts; images kept as PIL objects
            so base64 never enters the model context).
        user_text: optional text prompt for this turn.
        user_image: optional PIL image for this turn.
        max_new_tokens / temperature / top_p: generation settings.

    Returns:
        (ui_msgs, qwen_msgs, None, status) — the None clears the image panel.
    """
    if _loaded["model"] is None:
        return ui_msgs, qwen_msgs, None, "❌ Model not loaded. Please load the model first."
    if (not user_text or not user_text.strip()) and user_image is None:
        return ui_msgs, qwen_msgs, None, "❌ Please provide at least text or an image."
    model = _loaded["model"]
    processor = _loaded["processor"]

    # -------- UI display: embed image in markdown with visible line breaks --------
    ui_user_parts = []
    if user_text and user_text.strip():
        ui_user_parts.append(_md_linebreak(user_text.strip()))
    if user_image is not None:
        data_uri = _pil_to_data_uri(user_image, max_side=512)
        # Fix: data_uri was computed but appended as an empty string, so the
        # uploaded image never rendered in the chat. Embed it as a markdown
        # image instead.
        ui_user_parts.append(f"![image]({data_uri})")
    ui_user_display = "\n\n".join(ui_user_parts) if ui_user_parts else "[Image]"
    ui_msgs = list(ui_msgs or [])
    ui_msgs.append({"role": "user", "content": ui_user_display})

    # -------- Model inference: preserve raw structure (don't put base64 into model context) --------
    qwen_msgs = list(qwen_msgs or [])
    user_content = []
    if user_image is not None:
        user_content.append({"type": "image", "image": user_image})
    if user_text and user_text.strip():
        user_content.append({"type": "text", "text": user_text.strip()})
    qwen_msgs.append({"role": "user", "content": user_content})

    # Assemble model inputs via the chat template + vision preprocessor.
    text = processor.apply_chat_template(qwen_msgs, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(qwen_msgs)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move inputs to the device of the model's first parameter (works with
    # device_map sharding, where the first shard receives the inputs).
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        # Parameter-less model — fall back to a sensible default device.
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = inputs.to(target_device)

    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "do_sample": float(temperature) > 0,  # temperature == 0 -> greedy decoding
        "temperature": float(temperature),
        "top_p": float(top_p),
    }
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    # Strip the prompt tokens so only the newly generated reply is decoded.
    out_trim = [o[len(i):] for i, o in zip(inputs.input_ids, out)]
    resp = processor.batch_decode(out_trim, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Record the turn in both histories.
    ui_msgs.append({"role": "assistant", "content": _md_linebreak(resp)})
    qwen_msgs.append({"role": "assistant", "content": [{"type": "text", "text": resp}]})
    # Clear the input image panel (third return value None); the image stays
    # embedded in the chat history.
    return ui_msgs, qwen_msgs, None, "✅ Response generated (image cleared but remains in chat)."
def clear_chat():
    """Reset both chat histories and the image panel; report success."""
    empty_ui_history = []
    empty_qwen_history = []
    return empty_ui_history, empty_qwen_history, None, "✅ Chat cleared."
# ---------------------------------------------------------------------------
# Gradio UI: model management controls, image input panel, and chat window.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Qwen2-VL merged local inference WebUI") as demo:
    gr.Markdown("## Qwen2-VL (merged checkpoint) local deployment and inference")
    model_dir = gr.Textbox(label="Model directory", value=MODEL_DIR_DEFAULT)
    # Device / dtype / attention-implementation selectors.
    with gr.Row():
        device_pref = gr.Dropdown(choices=["auto", "cuda", "cpu"], value="auto", label="Device")
        dtype_pref = gr.Dropdown(choices=["auto", "float16", "bfloat16", "float32"], value="auto", label="dtype")
        use_flash_attn2 = gr.Checkbox(value=False, label="flash_attention_2 (enable if environment supports)")
    # Memory placement controls, passed through to load_model().
    with gr.Row():
        gpu_ids_str = gr.Textbox(
            label="Restrict to GPU IDs (optional)",
            placeholder='e.g., "0" or "0,1"; leave empty for no restriction',
            value=""
        )
        gpu_max_memory = gr.Textbox(
            label="Max memory per GPU (max_memory)",
            placeholder='e.g., "20GiB"',
            value="20GiB"
        )
        cpu_max_memory = gr.Textbox(
            label="CPU fallback max_memory (optional)",
            placeholder='e.g., "64GiB"; leave empty to skip',
            value="64GiB"
        )
    with gr.Row():
        btn_load = gr.Button("Load model", variant="primary")
        btn_unload = gr.Button("Unload model", variant="stop")
        btn_clear = gr.Button("Clear chat")
    status = gr.Markdown("(Status display)")
    chatbot = gr.Chatbot(label="Chat", height=520, type="messages")
    # Two parallel histories: markdown for display, structured for inference.
    state_ui = gr.State([])  # for Chatbot display
    state_qwen = gr.State([])  # for model inference context (with images)
    with gr.Row():
        with gr.Column():
            img_path = gr.Textbox(label="Load image from path (optional)", placeholder="/abs/path/to/image.jpg")
            btn_load_img = gr.Button("Load image to panel")
            img = gr.Image(type="pil", label="Uploaded image (optional)", height=300)
        with gr.Column():
            user_text = gr.Textbox(lines=6, label="Input text (optional)")
            with gr.Row():
                max_new_tokens = gr.Slider(1, 2048, value=256, step=1, label="max_new_tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
            btn_send = gr.Button("Send", variant="primary")

    # Load image button: read path into img component
    btn_load_img.click(load_image_from_path, [img_path], [img, status])
    # Load/unload model
    btn_load.click(
        load_model,
        [model_dir, device_pref, dtype_pref, use_flash_attn2, gpu_ids_str, gpu_max_memory, cpu_max_memory],
        status
    )
    btn_unload.click(unload_model, [], status)
    # Clear chat (then reset the display-history state separately).
    btn_clear.click(clear_chat, [], [chatbot, state_qwen, img, status]).then(lambda: [], [], state_ui)
    # Send: run a chat turn, then mirror the chatbot contents back into state_ui.
    btn_send.click(
        chat_generate,
        [state_ui, state_qwen, user_text, img, max_new_tokens, temperature, top_p],
        [chatbot, state_qwen, img, status],
    ).then(lambda x: x, chatbot, state_ui)

# Queue requests and launch; listens on all interfaces.
demo.queue(max_size=32).launch(
    server_name="0.0.0.0",
    server_port=None,  # automatically find a free port
    show_api=False
)
We suggest using the following prompt template:
**English Version:**
Input: An image-text pair: <image> <text>.
Task: Perform a comprehensive metaphor analysis.
Process: Internally evaluate the input for metaphorical content. Do not output your reasoning steps, only the final assessment.
Output Requirements:
1. Metaphor Presence: State "Yes" or "No".
2. If "Yes":
- Emotional Perspective: [Explanation]
- Intentional Perspective: [Explanation]
- Offensive Perspective: [Explanation]
- Mapping Process: Describe the 'source' and 'target' domains of the metaphor.
3. If "No":
- Reason: Provide a concise explanation for the absence of metaphor.
**中文版本:**
输入:一个图文对:<image> <text>。
任务:执行全面的隐喻分析。
过程:请在内部对输入内容进行隐喻评估。无需输出推理步骤,仅提供最终评估结果。
输出要求:
1. 隐喻存在性:回答“是”或“否”。
2. 如果为“是”:
- 情感视角:[解释]
- 意图视角:[解释]
- 冒犯视角:[解释]
- 映射过程:描述隐喻的“源域”和“目标域”。
3. 如果为“否”:
- 原因:简要说明不存在隐喻的原因。
To effectively evaluate the MAR algorithm and the quality of the constructed dataset, we randomly sampled 200 examples from the MET-Meme 2.0 dataset and manually annotated them. The results show that, compared with VLMs such as GPT-5.1-Thinking, Kimi K2.5, Claude-Opus-4-6-Thinking, and Gemini-3-Flash-Preview, the dataset constructed by the MAR algorithm is the most similar to the human-annotated dataset, outperforming the best VLM used as the judge model (Gemini-3-Flash-Preview) by 17%, as shown in the figure below.
- Downloads last month
- 8
