Model Card for Model ID

Model Details

Model Description

This model is based on Qwen2-VL-Instruct and was developed through the MP3L pipeline.

Training Details

Training Data

train_DPO.json, train_KTO.json

Quick Start

Below is a simple example that can be deployed locally and demonstrates how to use this model for metaphor detection:

import os
import gc
import torch
import io
import base64
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Default path to the merged model checkpoint directory (relative to the CWD).
MODEL_DIR_DEFAULT = "../MP3L/"

# Module-level registry for the currently loaded model/processor; None = unloaded.
_loaded = {"model": None, "processor": None}

def unload_model():
    """Release the cached model/processor and reclaim CPU/GPU memory.

    Returns:
        A short human-readable status message.
    """
    if _loaded["model"] is None:
        return "No model currently loaded."
    # Drop strong references first so gc can actually reclaim the weights.
    try:
        del _loaded["model"], _loaded["processor"]
    except Exception:
        pass
    _loaded.update(model=None, processor=None)
    gc.collect()
    # Hand cached CUDA allocations back to the driver when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    return "✅ Model unloaded and memory cleaned up."


def _parse_gpu_ids(gpu_ids_str: str):
    """
    Input example: "0" / "0,1" / " 1 , 3 "
    Returns: [0,1] / None (meaning no restriction)
    """
    s = (gpu_ids_str or "").strip()
    if not s:
        return None
    ids = []
    for p in s.replace(" ", "").split(","):
        if p == "":
            continue
        ids.append(int(p))
    ids = sorted(set(ids))
    return ids


def load_model(model_dir, device_pref, dtype_pref, use_flash_attn2, gpu_ids_str, gpu_max_memory, cpu_max_memory):
    """Load a Qwen2-VL checkpoint into the module-level `_loaded` registry.

    Any previously loaded model is unloaded first.

    Args:
        model_dir: path to the merged checkpoint directory.
        device_pref: "auto" / "cuda" / "cpu".
        dtype_pref: "auto" or a torch dtype name ("float16", "bfloat16", "float32").
        use_flash_attn2: if True, request the flash_attention_2 implementation
            (requires a compatible environment).
        gpu_ids_str: e.g., "0,1" restricts loading to those GPUs (via max_memory).
        gpu_max_memory: per-GPU memory budget string, e.g., "20GiB".
        cpu_max_memory: CPU offload budget string, e.g., "64GiB"; empty disables it.

    Returns:
        A human-readable status message describing the chosen placement.
    """
    unload_model()

    attn_impl = "flash_attention_2" if use_flash_attn2 else None
    # getattr(torch, "float16") etc. resolves the dtype name to a torch.dtype.
    dtype = "auto" if dtype_pref == "auto" else getattr(torch, dtype_pref)

    gpu_ids = _parse_gpu_ids(gpu_ids_str)

    # Key: limit GPU usage via max_memory
    # transformers will automatically split the model based on devices in max_memory
    max_memory = None
    device_map = None

    if device_pref == "cpu" or not torch.cuda.is_available():
        # Pure-CPU load: no device map, no memory budget needed.
        device_map = None
        max_memory = None
    else:
        # CUDA available
        if device_pref == "auto":
            device_map = "auto"
        else:
            # device_pref == "cuda": still use "auto" to allow cpu offload if needed
            device_map = "auto"

        if gpu_ids is not None:
            # Only the listed GPU ids get a budget, so the auto device map
            # will place weights exclusively on them (plus optional CPU).
            max_memory = {i: gpu_max_memory for i in gpu_ids}
            # Provide a CPU fallback (to avoid OOM if GPUs are full)
            if cpu_max_memory and cpu_max_memory.strip():
                max_memory["cpu"] = cpu_max_memory.strip()

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype=dtype,
        device_map=device_map,
        max_memory=max_memory,
        attn_implementation=attn_impl,
    )
    model.eval()
    processor = AutoProcessor.from_pretrained(model_dir)

    _loaded["model"] = model
    _loaded["processor"] = processor

    msg = f"✅ Model loaded: {model_dir}\n- device_map={device_map}"
    if max_memory is not None:
        msg += f"\n- max_memory={max_memory}"
    if gpu_ids is not None:
        msg += f"\n- Restricted GPU IDs={gpu_ids}"
    return msg


def load_image_from_path(path: str):
    """Open an image from a filesystem path and convert it to RGB.

    Returns:
        (PIL.Image or None, status_message)
    """
    candidate = (path or "").strip()
    if not candidate:
        return None, "❌ Please enter an image path."
    if not os.path.isfile(candidate):
        return None, f"❌ File does not exist: {candidate}"
    try:
        loaded = Image.open(candidate).convert("RGB")
    except Exception as e:
        return None, f"❌ Failed to load image: {repr(e)}"
    return loaded, f"✅ Image loaded: {candidate}"

def _md_linebreak(s: str) -> str:
    """Make line breaks render properly in Markdown."""
    s = (s or "").replace("\r\n", "\n")
    return s.replace("\n", "  \n")  # two spaces + newline = Markdown forced line break

def _pil_to_data_uri(img: Image.Image, max_side: int = 512) -> str:
    """PIL -> data:image/png;base64,... (resize to avoid huge strings)."""
    if img is None:
        return ""
    im = img.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1:
        im = im.resize((int(w / scale), int(h / scale)))
    buf = io.BytesIO()
    im.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{b64}"

def chat_generate(ui_msgs, qwen_msgs, user_text, user_image, max_new_tokens, temperature, top_p):
    """Run one chat turn: append the user message, generate, append the reply.

    Two parallel histories are maintained:
        ui_msgs   : messages for gr.Chatbot display (content as markdown string,
                    possibly containing base64-embedded images)
        qwen_msgs : messages for Qwen2-VL inference (content is a list of dicts
                    with type:text/image, images kept as PIL objects)

    Args:
        user_text: optional text input; user_image: optional PIL image.
        max_new_tokens / temperature / top_p: generation parameters.

    Returns:
        (ui_msgs, qwen_msgs, None, status) — the None clears the image panel.
    """
    if _loaded["model"] is None:
        return ui_msgs, qwen_msgs, None, "❌ Model not loaded. Please load the model first."
    if (not user_text or not user_text.strip()) and user_image is None:
        return ui_msgs, qwen_msgs, None, "❌ Please provide at least text or an image."

    model = _loaded["model"]
    processor = _loaded["processor"]

    # -------- UI display: embed image in markdown with visible line breaks --------
    ui_user_parts = []
    if user_text and user_text.strip():
        ui_user_parts.append(_md_linebreak(user_text.strip()))
    if user_image is not None:
        data_uri = _pil_to_data_uri(user_image, max_side=512)
        ui_user_parts.append(f"![image]({data_uri})")
    ui_user_display = "\n\n".join(ui_user_parts) if ui_user_parts else "[Image]"

    # Copy histories so the Gradio state objects are not mutated in place.
    ui_msgs = list(ui_msgs or [])
    ui_msgs.append({"role": "user", "content": ui_user_display})

    # -------- Model inference: preserve raw structure (don't put base64 into model context) --------
    qwen_msgs = list(qwen_msgs or [])
    user_content = []
    if user_image is not None:
        user_content.append({"type": "image", "image": user_image})
    if user_text and user_text.strip():
        user_content.append({"type": "text", "text": user_text.strip()})
    qwen_msgs.append({"role": "user", "content": user_content})

    # Assemble inputs: chat template for the text, vision helper for images/videos.
    text = processor.apply_chat_template(qwen_msgs, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(qwen_msgs)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move to the device where the model resides (supports user-specified GPU)
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        # Parameterless model (unlikely) — fall back to a sensible default.
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = inputs.to(target_device)

    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        # temperature == 0 means greedy decoding (sampling disabled).
        "do_sample": float(temperature) > 0,
        "temperature": float(temperature),
        "top_p": float(top_p),
    }

    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)

    # Strip the prompt tokens so only the newly generated reply is decoded.
    out_trim = [o[len(i):] for i, o in zip(inputs.input_ids, out)]
    resp = processor.batch_decode(out_trim, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Update both histories
    ui_msgs.append({"role": "assistant", "content": _md_linebreak(resp)})
    qwen_msgs.append({"role": "assistant", "content": [{"type": "text", "text": resp}]})

    # Return: ui_msgs for Chatbot; clear input image (img output None), status message
    return ui_msgs, qwen_msgs, None, "✅ Response generated (image cleared but remains in chat)."

def clear_chat():
    """Reset both chat histories and the image panel; return a status message."""
    empty_ui, empty_qwen = [], []
    return empty_ui, empty_qwen, None, "✅ Chat cleared."


# ---------------- Gradio UI: layout + event wiring ----------------
with gr.Blocks(title="Qwen2-VL merged local inference WebUI") as demo:
    gr.Markdown("## Qwen2-VL (merged checkpoint) local deployment and inference")

    model_dir = gr.Textbox(label="Model directory", value=MODEL_DIR_DEFAULT)

    # Device / precision / attention-implementation options for load_model.
    with gr.Row():
        device_pref = gr.Dropdown(choices=["auto", "cuda", "cpu"], value="auto", label="Device")
        dtype_pref = gr.Dropdown(choices=["auto", "float16", "bfloat16", "float32"], value="auto", label="dtype")
        use_flash_attn2 = gr.Checkbox(value=False, label="flash_attention_2 (enable if environment supports)")

    # GPU placement limits forwarded to transformers' max_memory mechanism.
    with gr.Row():
        gpu_ids_str = gr.Textbox(
            label="Restrict to GPU IDs (optional)",
            placeholder='e.g., "0" or "0,1"; leave empty for no restriction',
            value=""
        )
        gpu_max_memory = gr.Textbox(
            label="Max memory per GPU (max_memory)",
            placeholder='e.g., "20GiB"',
            value="20GiB"
        )
        cpu_max_memory = gr.Textbox(
            label="CPU fallback max_memory (optional)",
            placeholder='e.g., "64GiB"; leave empty to skip',
            value="64GiB"
        )

    with gr.Row():
        btn_load = gr.Button("Load model", variant="primary")
        btn_unload = gr.Button("Unload model", variant="stop")
        btn_clear = gr.Button("Clear chat")

    status = gr.Markdown("(Status display)")

    chatbot = gr.Chatbot(label="Chat", height=520, type="messages")
    state_ui = gr.State([])      # for Chatbot display
    state_qwen = gr.State([])    # for model inference context (with images)

    with gr.Row():
        with gr.Column():
            img_path = gr.Textbox(label="Load image from path (optional)", placeholder="/abs/path/to/image.jpg")
            btn_load_img = gr.Button("Load image to panel")
        img = gr.Image(type="pil", label="Uploaded image (optional)", height=300)

        with gr.Column():
            user_text = gr.Textbox(lines=6, label="Input text (optional)")
            with gr.Row():
                max_new_tokens = gr.Slider(1, 2048, value=256, step=1, label="max_new_tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
            btn_send = gr.Button("Send", variant="primary")

    # Load image button: read path into img component
    btn_load_img.click(load_image_from_path, [img_path], [img, status])

    # Load/unload model
    btn_load.click(
        load_model,
        [model_dir, device_pref, dtype_pref, use_flash_attn2, gpu_ids_str, gpu_max_memory, cpu_max_memory],
        status
    )
    btn_unload.click(unload_model, [], status)

    # Clear chat (the chained .then also empties the UI-side history state)
    btn_clear.click(clear_chat, [], [chatbot, state_qwen, img, status]).then(lambda: [], [], state_ui)

    # Send: generate a reply, then mirror the chatbot contents into state_ui
    btn_send.click(
        chat_generate,
        [state_ui, state_qwen, user_text, img, max_new_tokens, temperature, top_p],
        [chatbot, state_qwen, img, status],
    ).then(lambda x: x, chatbot, state_ui)

# Launch with a request queue; listens on all interfaces.
demo.queue(max_size=32).launch(
    server_name="0.0.0.0",
    server_port=None,   # automatically find a free port
    show_api=False
)

Suggestions using the following prompt words:

**English Version:**
Input: An image-text pair: <image> <text>.
Task: Perform a comprehensive metaphor analysis.
Process: Internally evaluate the input for metaphorical content. Do not output your reasoning steps, only the final assessment.
Output Requirements:
1. Metaphor Presence: State "Yes" or "No".
2. If "Yes":
   - Emotional Perspective: [Explanation]
   - Intentional Perspective: [Explanation]
   - Offensive Perspective: [Explanation]
   - Mapping Process: Describe the 'source' and 'target' domains of the metaphor.
3. If "No":
   - Reason: Provide a concise explanation for the absence of metaphor.

**中文版本:**
输入:一个图文对:<image> <text>。
任务:执行全面的隐喻分析。
过程:请在内部对输入内容进行隐喻评估。无需输出推理步骤,仅提供最终评估结果。
输出要求:
1. 隐喻存在性:回答“是”或“否”。
2. 如果为“是”:
   - 情感视角:[解释]
   - 意图视角:[解释]
   - 冒犯视角:[解释]
   - 映射过程:描述隐喻的“源域”和“目标域”。
3. 如果为“否”:
   - 原因:简要说明不存在隐喻的原因。

To effectively evaluate the MAR algorithm and the quality of the constructed dataset, we randomly sampled 200 examples from the MET-Meme 2.0 dataset and annotated them manually. The results show that, compared with VLMs such as GPT-5.1-Thinking, Kimi K2.5, Claude-Opus-4-6-Thinking, and Gemini-3-Flash-Preview, the dataset constructed by the MAR algorithm is the most similar to the human-annotated dataset, outperforming the best VLM used as the judge model (Gemini-3-Flash-Preview) by 17%, as shown in the figure below.

Figure

Downloads last month
8
Safetensors
Model size
8B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support