hxssgaa committed
Commit 5a4159a · verified · 1 parent: a9d4f11

Upload checkpoint

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "</think>": 151668,
+ "</tool_call>": 151658,
+ "</tool_response>": 151666,
+ "<think>": 151667,
+ "<tool_call>": 151657,
+ "<tool_response>": 151665,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
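
As a quick sanity check of the mapping above, the ids can be read back through the tokenizer once the checkpoint is downloaded. A minimal sketch, assuming `transformers` is installed and the files sit in a local directory (the path below is a placeholder):

from transformers import AutoTokenizer

# Placeholder path: point this at a local clone of this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("./colqwen3-checkpoint")

# These ids come straight from added_tokens.json above.
assert tokenizer.convert_tokens_to_ids("<|im_start|>") == 151644
assert tokenizer.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tokenizer.convert_tokens_to_ids("<think>") == 151667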
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {%- if messages[0].content is string %}
+ {{- messages[0].content }}
+ {%- else %}
+ {%- for content in messages[0].content %}
+ {%- if 'text' in content %}
+ {{- content.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].content is string %}
+ {{- messages[0].content }}
+ {%- else %}
+ {%- for content in messages[0].content %}
+ {%- if 'text' in content %}
+ {{- content.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set image_count = namespace(value=0) %}
+ {%- set video_count = namespace(value=0) %}
+ {%- for message in messages %}
+ {%- if message.role == "user" %}
+ {{- '<|im_start|>' + message.role + '\n' }}
+ {%- if message.content is string %}
+ {{- message.content }}
+ {%- else %}
+ {%- for content in message.content %}
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+ {%- set image_count.value = image_count.value + 1 %}
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+ <|vision_start|><|image_pad|><|vision_end|>
+ {%- elif content.type == 'video' or 'video' in content %}
+ {%- set video_count.value = video_count.value + 1 %}
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+ <|vision_start|><|video_pad|><|vision_end|>
+ {%- elif 'text' in content %}
+ {{- content.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role + '\n' }}
+ {%- if message.content is string %}
+ {{- message.content }}
+ {%- else %}
+ {%- for content_item in message.content %}
+ {%- if 'text' in content_item %}
+ {{- content_item.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and message.content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {%- if message.content is string %}
+ {{- message.content }}
+ {%- else %}
+ {%- for content in message.content %}
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+ {%- set image_count.value = image_count.value + 1 %}
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+ <|vision_start|><|image_pad|><|vision_end|>
+ {%- elif content.type == 'video' or 'video' in content %}
+ {%- set video_count.value = video_count.value + 1 %}
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+ <|vision_start|><|video_pad|><|vision_end|>
+ {%- elif 'text' in content %}
+ {{- content.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
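
The template above handles plain text, interleaved image/video placeholders, tool definitions, and tool responses. A minimal rendering sketch, assuming the tokenizer for this checkpoint is loaded from a placeholder local path and a recent `transformers` release that picks up chat_template.jinja:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./colqwen3-checkpoint")  # placeholder path

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "image"},  # rendered by the template as <|vision_start|><|image_pad|><|vision_end|>
        {"type": "text", "text": "Describe this page."},
    ]},
]

# tokenize=False returns the rendered prompt string instead of token ids.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)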
config.json ADDED
@@ -0,0 +1,72 @@
+ {
+ "architectures": [
+ "ColQwen3"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_colqwen3.ColQwen3Config",
+ "AutoModel": "modeling_colqwen3.ColQwen3"
+ },
+ "dtype": "bfloat16",
+ "embed_dim": 320,
+ "image_token_id": 151655,
+ "initializer_range": 0.02,
+ "max_num_visual_tokens": 1280,
+ "model_type": "colqwen3",
+ "padding_side": "left",
+ "text_config": {
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "float32",
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 262144,
+ "model_type": "qwen3_vl_text",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_interleaved": true,
+ "mrope_section": [
+ 24,
+ 20,
+ 20
+ ],
+ "rope_type": "default"
+ },
+ "rope_theta": 5000000,
+ "use_cache": true,
+ "vocab_size": 151936
+ },
+ "tie_word_embeddings": false,
+ "transformers_version": "4.57.1",
+ "video_token_id": 151656,
+ "vision_config": {
+ "deepstack_visual_indexes": [
+ 8,
+ 16,
+ 24
+ ],
+ "depth": 27,
+ "dtype": "float32",
+ "hidden_act": "gelu_pytorch_tanh",
+ "hidden_size": 1152,
+ "in_channels": 3,
+ "initializer_range": 0.02,
+ "intermediate_size": 4304,
+ "model_type": "qwen3_vl",
+ "num_heads": 16,
+ "num_position_embeddings": 2304,
+ "out_hidden_size": 4096,
+ "patch_size": 16,
+ "spatial_merge_size": 2,
+ "temporal_patch_size": 2
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652
+ }
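
Because `auto_map` points `AutoConfig`/`AutoModel` at the custom `configuration_colqwen3`/`modeling_colqwen3` modules, the checkpoint is intended to be loaded with `trust_remote_code`. A minimal loading sketch, assuming a placeholder local path and enough memory for the bfloat16 weights:

import torch
from transformers import AutoConfig, AutoModel

path = "./colqwen3-checkpoint"  # placeholder: local clone of this repo

config = AutoConfig.from_pretrained(path, trust_remote_code=True)
print(config.model_type, config.embed_dim)  # "colqwen3", 320

model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,      # required so the auto_map classes above are used
    torch_dtype=torch.bfloat16,  # matches "dtype": "bfloat16" in config.json
)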
configuration_colqwen3.py ADDED
@@ -0,0 +1,112 @@
+ """
+ # Copyright 2025 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ Configuration for ColQwen3, adapted to mirror the ColQwen2 structure.
+ """
+
+ from copy import deepcopy
+ from typing import Any
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.models.auto import CONFIG_MAPPING
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig, Qwen3VLVisionConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class ColQwen3Config(PretrainedConfig):
+     """Configuration for ColQwen3 retrieval model."""
+
+     model_type = "colqwen3"
+     sub_configs: dict[str, Any] = {"vision_config": Qwen3VLVisionConfig, "text_config": Qwen3VLTextConfig}
+
+     def __init__(
+         self,
+         vision_config: Any = None,
+         text_config: Any = None,
+         embed_dim: int = 320,
+         padding_side: str = "left",
+         initializer_range: float = 0.02,
+         dtype: str | None = None,
+         **kwargs,
+     ):
+         if vision_config is None or text_config is None:
+             base_vlm_config = CONFIG_MAPPING["qwen3_vl"]()
+             if vision_config is None:
+                 vision_config = deepcopy(base_vlm_config.vision_config)
+                 logger.info("`vision_config` is `None`. Initializing with the default `Qwen3VLVisionConfig`.")
+             if text_config is None:
+                 text_config = deepcopy(base_vlm_config.text_config)
+                 logger.info("`text_config` is `None`. Initializing with the default `Qwen3VLTextConfig`.")
+
+         if isinstance(vision_config, dict):
+             vision_config = Qwen3VLVisionConfig(**deepcopy(vision_config))
+         elif not isinstance(vision_config, PretrainedConfig):
+             raise TypeError(
+                 f"Invalid type for `vision_config`. Expected `PretrainedConfig`, `dict`, or `None`, got {type(vision_config)}."
+             )
+
+         if isinstance(text_config, dict):
+             text_config = Qwen3VLTextConfig(**deepcopy(text_config))
+         elif not isinstance(text_config, PretrainedConfig):
+             raise TypeError(
+                 f"Invalid type for `text_config`. Expected `PretrainedConfig`, `dict`, or `None`, got {type(text_config)}."
+             )
+
+         if embed_dim <= 0:
+             raise ValueError(f"`embed_dim` must be positive, got {embed_dim}.")
+
+         super().__init__(**kwargs)
+         self.vision_config = vision_config
+         self.text_config = text_config
+         self.embed_dim = embed_dim
+         self.padding_side = padding_side
+         self.initializer_range = initializer_range
+         # Preserve incoming dtype so downstream models avoid attribute errors
+         self.dtype = dtype or getattr(self, "dtype", None)
+
+     @classmethod
+     def from_base_config(cls, base_config: PretrainedConfig) -> "ColQwen3Config":
+         """Upgrade a base Qwen3VLConfig-like config into ColQwen3Config."""
+         if isinstance(base_config, dict):
+             data = dict(base_config)
+         else:
+             data = base_config.to_dict()
+
+         vision_cfg = data.get("vision_config")
+         if isinstance(vision_cfg, dict):
+             data["vision_config"] = Qwen3VLVisionConfig.from_dict(vision_cfg)
+
+         text_cfg = data.get("text_config")
+         if isinstance(text_cfg, dict):
+             data["text_config"] = Qwen3VLTextConfig.from_dict(text_cfg)
+
+         data.setdefault("model_type", cls.model_type)
+         if hasattr(base_config, "dtype"):
+             data.setdefault("dtype", getattr(base_config, "dtype"))
+         elif hasattr(base_config, "torch_dtype") and base_config.torch_dtype is not None:
+             data.setdefault("dtype", str(base_config.torch_dtype))
+
+         return cls.from_dict(data)
+
+     def get_text_config(self, *args, **kwargs) -> PretrainedConfig:
+         return self.text_config
+
+
+ DEFAULT_CONFIG = ColQwen3Config()
+
+ __all__ = ["ColQwen3Config", "DEFAULT_CONFIG"]
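
For reference, the two entry points of this configuration class can be exercised directly. A small sketch, assuming it is run from a local clone of this repo (so `configuration_colqwen3` is importable) with a `transformers` version that ships `qwen3_vl`; the base repo id below is a placeholder:

from transformers import AutoConfig
from configuration_colqwen3 import ColQwen3Config

# Default construction: vision/text sub-configs fall back to the stock qwen3_vl defaults.
cfg = ColQwen3Config(embed_dim=320, padding_side="left")
print(cfg.embed_dim, cfg.text_config.model_type)  # 320 qwen3_vl_text

# Upgrading a base Qwen3-VL config into a ColQwen3Config.
base = AutoConfig.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")  # placeholder base repo id
col_cfg = ColQwen3Config.from_base_config(base)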
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1b928ac167631ed249407daa9e16978d845c13903595c7bc6402e6e8f3a8728
+ size 4998058256
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:239a5e98a9cccf7c433c4d5efb4e78ab02b449638d0580b606ef990ea12925c2
+ size 4915963032
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9a4a0136e1c9b2dbe86ddea10bbae338847c37663ead885ea9f0259303c299b
+ size 4915963064
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7915f327abfb2fe4ab7ccc685eb30a8faa12792ae634f922a8e1387609663f8d
+ size 2706980400
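
The four shard entries above are Git LFS pointer files, not the weights themselves; LFS fetches the real safetensors shards using the recorded oid and size. A small sketch of reading such a pointer, assuming a local clone where the pointer has not yet been replaced by the actual file (the path is a placeholder):

# Parse a Git LFS pointer file (version / oid / size key-value lines).
def read_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = read_lfs_pointer("model-00001-of-00004.safetensors")  # placeholder local path
print(ptr["oid"], int(ptr["size"]))  # sha256:f1b928ac... 4998058256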
model.safetensors.index.json ADDED
@@ -0,0 +1,760 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 8768434736,
4
+ "total_size": 17536869472
5
+ },
6
+ "weight_map": {
7
+ "embedding_proj_layer.bias": "model-00004-of-00004.safetensors",
8
+ "embedding_proj_layer.weight": "model-00004-of-00004.safetensors",
9
+ "vlm.lm_head.weight": "model-00004-of-00004.safetensors",
10
+ "vlm.model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
11
+ "vlm.model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
12
+ "vlm.model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
13
+ "vlm.model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
14
+ "vlm.model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
15
+ "vlm.model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
16
+ "vlm.model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
17
+ "vlm.model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
18
+ "vlm.model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
19
+ "vlm.model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
20
+ "vlm.model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
21
+ "vlm.model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
22
+ "vlm.model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
23
+ "vlm.model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
24
+ "vlm.model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
25
+ "vlm.model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
26
+ "vlm.model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
27
+ "vlm.model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
28
+ "vlm.model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
29
+ "vlm.model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
30
+ "vlm.model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
31
+ "vlm.model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
32
+ "vlm.model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "vlm.model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "vlm.model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "vlm.model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "vlm.model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "vlm.model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "vlm.model.language_model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
39
+ "vlm.model.language_model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "vlm.model.language_model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "vlm.model.language_model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
42
+ "vlm.model.language_model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "vlm.model.language_model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "vlm.model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "vlm.model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "vlm.model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "vlm.model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "vlm.model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "vlm.model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
50
+ "vlm.model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "vlm.model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "vlm.model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
53
+ "vlm.model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "vlm.model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
55
+ "vlm.model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
56
+ "vlm.model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
57
+ "vlm.model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
58
+ "vlm.model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
59
+ "vlm.model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
60
+ "vlm.model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
61
+ "vlm.model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
62
+ "vlm.model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
63
+ "vlm.model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
64
+ "vlm.model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
65
+ "vlm.model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
66
+ "vlm.model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
67
+ "vlm.model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
68
+ "vlm.model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
69
+ "vlm.model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
70
+ "vlm.model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
71
+ "vlm.model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
72
+ "vlm.model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
73
+ "vlm.model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
74
+ "vlm.model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
75
+ "vlm.model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
76
+ "vlm.model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
77
+ "vlm.model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
78
+ "vlm.model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
79
+ "vlm.model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
80
+ "vlm.model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
81
+ "vlm.model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "vlm.model.language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
83
+ "vlm.model.language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
84
+ "vlm.model.language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
85
+ "vlm.model.language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
86
+ "vlm.model.language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
87
+ "vlm.model.language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
88
+ "vlm.model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
89
+ "vlm.model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
90
+ "vlm.model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
91
+ "vlm.model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
92
+ "vlm.model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "vlm.model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
94
+ "vlm.model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
95
+ "vlm.model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
96
+ "vlm.model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
97
+ "vlm.model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
98
+ "vlm.model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
99
+ "vlm.model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
100
+ "vlm.model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
101
+ "vlm.model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
102
+ "vlm.model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
103
+ "vlm.model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
104
+ "vlm.model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
105
+ "vlm.model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
106
+ "vlm.model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
107
+ "vlm.model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
108
+ "vlm.model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
109
+ "vlm.model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
110
+ "vlm.model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
111
+ "vlm.model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
112
+ "vlm.model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
113
+ "vlm.model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
114
+ "vlm.model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
115
+ "vlm.model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
116
+ "vlm.model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
117
+ "vlm.model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
118
+ "vlm.model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
119
+ "vlm.model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
120
+ "vlm.model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
121
+ "vlm.model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "vlm.model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
123
+ "vlm.model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
124
+ "vlm.model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
125
+ "vlm.model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
126
+ "vlm.model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
127
+ "vlm.model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
128
+ "vlm.model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
129
+ "vlm.model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
130
+ "vlm.model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
131
+ "vlm.model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
132
+ "vlm.model.language_model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "vlm.model.language_model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
134
+ "vlm.model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
135
+ "vlm.model.language_model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
136
+ "vlm.model.language_model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
137
+ "vlm.model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
138
+ "vlm.model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
139
+ "vlm.model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
140
+ "vlm.model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
141
+ "vlm.model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
142
+ "vlm.model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
143
+ "vlm.model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
144
+ "vlm.model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
145
+ "vlm.model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
146
+ "vlm.model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
147
+ "vlm.model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
148
+ "vlm.model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
149
+ "vlm.model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
150
+ "vlm.model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
151
+ "vlm.model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
152
+ "vlm.model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
153
+ "vlm.model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
154
+ "vlm.model.language_model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
155
+ "vlm.model.language_model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
156
+ "vlm.model.language_model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
157
+ "vlm.model.language_model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
158
+ "vlm.model.language_model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
159
+ "vlm.model.language_model.layers.20.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
160
+ "vlm.model.language_model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
161
+ "vlm.model.language_model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
162
+ "vlm.model.language_model.layers.20.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
163
+ "vlm.model.language_model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
164
+ "vlm.model.language_model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
165
+ "vlm.model.language_model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "vlm.model.language_model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "vlm.model.language_model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "vlm.model.language_model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "vlm.model.language_model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "vlm.model.language_model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
171
+ "vlm.model.language_model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "vlm.model.language_model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "vlm.model.language_model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
174
+ "vlm.model.language_model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "vlm.model.language_model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "vlm.model.language_model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "vlm.model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "vlm.model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "vlm.model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "vlm.model.language_model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "vlm.model.language_model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
182
+ "vlm.model.language_model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "vlm.model.language_model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "vlm.model.language_model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
185
+ "vlm.model.language_model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "vlm.model.language_model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
187
+ "vlm.model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
188
+ "vlm.model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
189
+ "vlm.model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
190
+ "vlm.model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
191
+ "vlm.model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
192
+ "vlm.model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
193
+ "vlm.model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
194
+ "vlm.model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
195
+ "vlm.model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
196
+ "vlm.model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
197
+ "vlm.model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
198
+ "vlm.model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
199
+ "vlm.model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
200
+ "vlm.model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
201
+ "vlm.model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
202
+ "vlm.model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
203
+ "vlm.model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
204
+ "vlm.model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
205
+ "vlm.model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
206
+ "vlm.model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
207
+ "vlm.model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
208
+ "vlm.model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
209
+ "vlm.model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
210
+ "vlm.model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
211
+ "vlm.model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
212
+ "vlm.model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
213
+ "vlm.model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "vlm.model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
215
+ "vlm.model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
216
+ "vlm.model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
217
+ "vlm.model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
218
+ "vlm.model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
219
+ "vlm.model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
220
+ "vlm.model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
221
+ "vlm.model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
222
+ "vlm.model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
223
+ "vlm.model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
224
+ "vlm.model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "vlm.model.language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
226
+ "vlm.model.language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
227
+ "vlm.model.language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
228
+ "vlm.model.language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
229
+ "vlm.model.language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
230
+ "vlm.model.language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
231
+ "vlm.model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
232
+ "vlm.model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
233
+ "vlm.model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
234
+ "vlm.model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
235
+ "vlm.model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
236
+ "vlm.model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
237
+ "vlm.model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
238
+ "vlm.model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
239
+ "vlm.model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
240
+ "vlm.model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
241
+ "vlm.model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
242
+ "vlm.model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
243
+ "vlm.model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
244
+ "vlm.model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
245
+ "vlm.model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
246
+ "vlm.model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
247
+ "vlm.model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
248
+ "vlm.model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
249
+ "vlm.model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
250
+ "vlm.model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
251
+ "vlm.model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
252
+ "vlm.model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
253
+ "vlm.model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "vlm.model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
255
+ "vlm.model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
256
+ "vlm.model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
257
+ "vlm.model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
258
+ "vlm.model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
259
+ "vlm.model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
260
+ "vlm.model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
261
+ "vlm.model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
262
+ "vlm.model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
263
+ "vlm.model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
264
+ "vlm.model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "vlm.model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
266
+ "vlm.model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
267
+ "vlm.model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
268
+ "vlm.model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
269
+ "vlm.model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
270
+ "vlm.model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
271
+ "vlm.model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
272
+ "vlm.model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
273
+ "vlm.model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
274
+ "vlm.model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
275
+ "vlm.model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
276
+ "vlm.model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
277
+ "vlm.model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
278
+ "vlm.model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
279
+ "vlm.model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
280
+ "vlm.model.language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
281
+ "vlm.model.language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
282
+ "vlm.model.language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
283
+ "vlm.model.language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
284
+ "vlm.model.language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
285
+ "vlm.model.language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
286
+ "vlm.model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
287
+ "vlm.model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
288
+ "vlm.model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
289
+ "vlm.model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
290
+ "vlm.model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
291
+ "vlm.model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
292
+ "vlm.model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
293
+ "vlm.model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
294
+ "vlm.model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
295
+ "vlm.model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
296
+ "vlm.model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
297
+ "vlm.model.language_model.layers.32.input_layernorm.weight": "model-00004-of-00004.safetensors",
298
+ "vlm.model.language_model.layers.32.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
299
+ "vlm.model.language_model.layers.32.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
300
+ "vlm.model.language_model.layers.32.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
301
+ "vlm.model.language_model.layers.32.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
302
+ "vlm.model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
303
+ "vlm.model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
304
+ "vlm.model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
305
+ "vlm.model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
306
+ "vlm.model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
307
+ "vlm.model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
308
+ "vlm.model.language_model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
309
+ "vlm.model.language_model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
310
+ "vlm.model.language_model.layers.33.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
311
+ "vlm.model.language_model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
312
+ "vlm.model.language_model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
313
+ "vlm.model.language_model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
314
+ "vlm.model.language_model.layers.33.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
315
+ "vlm.model.language_model.layers.33.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
316
+ "vlm.model.language_model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
317
+ "vlm.model.language_model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
318
+ "vlm.model.language_model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
319
+ "vlm.model.language_model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
320
+ "vlm.model.language_model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
321
+ "vlm.model.language_model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
322
+ "vlm.model.language_model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
323
+ "vlm.model.language_model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
324
+ "vlm.model.language_model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
325
+ "vlm.model.language_model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
326
+ "vlm.model.language_model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
327
+ "vlm.model.language_model.layers.34.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
328
+ "vlm.model.language_model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
329
+ "vlm.model.language_model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
330
+ "vlm.model.language_model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
331
+ "vlm.model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
332
+ "vlm.model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
333
+ "vlm.model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
334
+ "vlm.model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
335
+ "vlm.model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
336
+ "vlm.model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
337
+ "vlm.model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
338
+ "vlm.model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
339
+ "vlm.model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
340
+ "vlm.model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
341
+ "vlm.model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
342
+ "vlm.model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
343
+ "vlm.model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
344
+ "vlm.model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
345
+ "vlm.model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
346
+ "vlm.model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
347
+ "vlm.model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
348
+ "vlm.model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
349
+ "vlm.model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
350
+ "vlm.model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
351
+ "vlm.model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
352
+ "vlm.model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
353
+ "vlm.model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
354
+ "vlm.model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
355
+ "vlm.model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
356
+ "vlm.model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
357
+ "vlm.model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
358
+ "vlm.model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
359
+ "vlm.model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
360
+ "vlm.model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
361
+ "vlm.model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
362
+ "vlm.model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
363
+ "vlm.model.language_model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
364
+ "vlm.model.language_model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
365
+ "vlm.model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
366
+ "vlm.model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
367
+ "vlm.model.language_model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
368
+ "vlm.model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
369
+ "vlm.model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
370
+ "vlm.model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
371
+ "vlm.model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
372
+ "vlm.model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
373
+ "vlm.model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
374
+ "vlm.model.language_model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
375
+ "vlm.model.language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
376
+ "vlm.model.language_model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
377
+ "vlm.model.language_model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
378
+ "vlm.model.language_model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
379
+ "vlm.model.language_model.layers.7.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
380
+ "vlm.model.language_model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
381
+ "vlm.model.language_model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
382
+ "vlm.model.language_model.layers.7.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
383
+ "vlm.model.language_model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
384
+ "vlm.model.language_model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
385
+ "vlm.model.language_model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
386
+ "vlm.model.language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
387
+ "vlm.model.language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
388
+ "vlm.model.language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
389
+ "vlm.model.language_model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
390
+ "vlm.model.language_model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
391
+ "vlm.model.language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
392
+ "vlm.model.language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
393
+ "vlm.model.language_model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
394
+ "vlm.model.language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
395
+ "vlm.model.language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
396
+ "vlm.model.language_model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
397
+ "vlm.model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
398
+ "vlm.model.language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
399
+ "vlm.model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
400
+ "vlm.model.language_model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
401
+ "vlm.model.language_model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
402
+ "vlm.model.language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
403
+ "vlm.model.language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
404
+ "vlm.model.language_model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
405
+ "vlm.model.language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
406
+ "vlm.model.language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
407
+ "vlm.model.language_model.norm.weight": "model-00004-of-00004.safetensors",
408
+ "vlm.model.visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
409
+ "vlm.model.visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
410
+ "vlm.model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
411
+ "vlm.model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
412
+ "vlm.model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
413
+ "vlm.model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
414
+ "vlm.model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
415
+ "vlm.model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
416
+ "vlm.model.visual.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
417
+ "vlm.model.visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
418
+ "vlm.model.visual.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
419
+ "vlm.model.visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
420
+ "vlm.model.visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
421
+ "vlm.model.visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
422
+ "vlm.model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
423
+ "vlm.model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
424
+ "vlm.model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
425
+ "vlm.model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
426
+ "vlm.model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
427
+ "vlm.model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
428
+ "vlm.model.visual.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
429
+ "vlm.model.visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
430
+ "vlm.model.visual.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
431
+ "vlm.model.visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
432
+ "vlm.model.visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
433
+ "vlm.model.visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
434
+ "vlm.model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
435
+ "vlm.model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
436
+ "vlm.model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
437
+ "vlm.model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
438
+ "vlm.model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
439
+ "vlm.model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
440
+ "vlm.model.visual.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
441
+ "vlm.model.visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
442
+ "vlm.model.visual.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
443
+ "vlm.model.visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
444
+ "vlm.model.visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
445
+ "vlm.model.visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
446
+ "vlm.model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
447
+ "vlm.model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
448
+ "vlm.model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
449
+ "vlm.model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
450
+ "vlm.model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
451
+ "vlm.model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
452
+ "vlm.model.visual.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
453
+ "vlm.model.visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
454
+ "vlm.model.visual.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
455
+ "vlm.model.visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
456
+ "vlm.model.visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
457
+ "vlm.model.visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
458
+ "vlm.model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
459
+ "vlm.model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
460
+ "vlm.model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
461
+ "vlm.model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
462
+ "vlm.model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
463
+ "vlm.model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
464
+ "vlm.model.visual.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
465
+ "vlm.model.visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
466
+ "vlm.model.visual.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
467
+ "vlm.model.visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
468
+ "vlm.model.visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
469
+ "vlm.model.visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
470
+ "vlm.model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
471
+ "vlm.model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
472
+ "vlm.model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
473
+ "vlm.model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
474
+ "vlm.model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
475
+ "vlm.model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
476
+ "vlm.model.visual.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
477
+ "vlm.model.visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
478
+ "vlm.model.visual.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
479
+ "vlm.model.visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
480
+ "vlm.model.visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
481
+ "vlm.model.visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
482
+ "vlm.model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
483
+ "vlm.model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
484
+ "vlm.model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
485
+ "vlm.model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
486
+ "vlm.model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
487
+ "vlm.model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
488
+ "vlm.model.visual.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
489
+ "vlm.model.visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
490
+ "vlm.model.visual.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
491
+ "vlm.model.visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
492
+ "vlm.model.visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
493
+ "vlm.model.visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
494
+ "vlm.model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
495
+ "vlm.model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
496
+ "vlm.model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
497
+ "vlm.model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
498
+ "vlm.model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
499
+ "vlm.model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
500
+ "vlm.model.visual.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
501
+ "vlm.model.visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
502
+ "vlm.model.visual.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
503
+ "vlm.model.visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
504
+ "vlm.model.visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
505
+ "vlm.model.visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
506
+ "vlm.model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
507
+ "vlm.model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
508
+ "vlm.model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
509
+ "vlm.model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
510
+ "vlm.model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
511
+ "vlm.model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
512
+ "vlm.model.visual.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
513
+ "vlm.model.visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
514
+ "vlm.model.visual.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
515
+ "vlm.model.visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
516
+ "vlm.model.visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
517
+ "vlm.model.visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
518
+ "vlm.model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
519
+ "vlm.model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
520
+ "vlm.model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
521
+ "vlm.model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
522
+ "vlm.model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
523
+ "vlm.model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
524
+ "vlm.model.visual.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
525
+ "vlm.model.visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
526
+ "vlm.model.visual.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
527
+ "vlm.model.visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
528
+ "vlm.model.visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
529
+ "vlm.model.visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
530
+ "vlm.model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
531
+ "vlm.model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
532
+ "vlm.model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
533
+ "vlm.model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
534
+ "vlm.model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
535
+ "vlm.model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
536
+ "vlm.model.visual.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
537
+ "vlm.model.visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
538
+ "vlm.model.visual.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
539
+ "vlm.model.visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
540
+ "vlm.model.visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
541
+ "vlm.model.visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
542
+ "vlm.model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
543
+ "vlm.model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
544
+ "vlm.model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
545
+ "vlm.model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
546
+ "vlm.model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
547
+ "vlm.model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
548
+ "vlm.model.visual.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
549
+ "vlm.model.visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
550
+ "vlm.model.visual.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
551
+ "vlm.model.visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
552
+ "vlm.model.visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
553
+ "vlm.model.visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
554
+ "vlm.model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
555
+ "vlm.model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
556
+ "vlm.model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
557
+ "vlm.model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
558
+ "vlm.model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
559
+ "vlm.model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
560
+ "vlm.model.visual.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
561
+ "vlm.model.visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
562
+ "vlm.model.visual.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
563
+ "vlm.model.visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
564
+ "vlm.model.visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
565
+ "vlm.model.visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
566
+ "vlm.model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
567
+ "vlm.model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
568
+ "vlm.model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
569
+ "vlm.model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
570
+ "vlm.model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
571
+ "vlm.model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
572
+ "vlm.model.visual.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
573
+ "vlm.model.visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
574
+ "vlm.model.visual.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
575
+ "vlm.model.visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
576
+ "vlm.model.visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
577
+ "vlm.model.visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
578
+ "vlm.model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
579
+ "vlm.model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
580
+ "vlm.model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
581
+ "vlm.model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
582
+ "vlm.model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
583
+ "vlm.model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
584
+ "vlm.model.visual.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
585
+ "vlm.model.visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
586
+ "vlm.model.visual.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
587
+ "vlm.model.visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
588
+ "vlm.model.visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
589
+ "vlm.model.visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
590
+ "vlm.model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
591
+ "vlm.model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
592
+ "vlm.model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
593
+ "vlm.model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
594
+ "vlm.model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
595
+ "vlm.model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
596
+ "vlm.model.visual.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
597
+ "vlm.model.visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
598
+ "vlm.model.visual.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
599
+ "vlm.model.visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
600
+ "vlm.model.visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
601
+ "vlm.model.visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
602
+ "vlm.model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
603
+ "vlm.model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
604
+ "vlm.model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
605
+ "vlm.model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
606
+ "vlm.model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
607
+ "vlm.model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
608
+ "vlm.model.visual.blocks.23.norm1.bias": "model-00001-of-00004.safetensors",
609
+ "vlm.model.visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
610
+ "vlm.model.visual.blocks.23.norm2.bias": "model-00001-of-00004.safetensors",
611
+ "vlm.model.visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
612
+ "vlm.model.visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
613
+ "vlm.model.visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
614
+ "vlm.model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
615
+ "vlm.model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
616
+ "vlm.model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
617
+ "vlm.model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
618
+ "vlm.model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
619
+ "vlm.model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
620
+ "vlm.model.visual.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
621
+ "vlm.model.visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
622
+ "vlm.model.visual.blocks.24.norm2.bias": "model-00001-of-00004.safetensors",
623
+ "vlm.model.visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
624
+ "vlm.model.visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
625
+ "vlm.model.visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
626
+ "vlm.model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
627
+ "vlm.model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
628
+ "vlm.model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
629
+ "vlm.model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
630
+ "vlm.model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
631
+ "vlm.model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
632
+ "vlm.model.visual.blocks.25.norm1.bias": "model-00001-of-00004.safetensors",
633
+ "vlm.model.visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
634
+ "vlm.model.visual.blocks.25.norm2.bias": "model-00001-of-00004.safetensors",
635
+ "vlm.model.visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
636
+ "vlm.model.visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
637
+ "vlm.model.visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
638
+ "vlm.model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
639
+ "vlm.model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
640
+ "vlm.model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
641
+ "vlm.model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
642
+ "vlm.model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
643
+ "vlm.model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
644
+ "vlm.model.visual.blocks.26.norm1.bias": "model-00001-of-00004.safetensors",
645
+ "vlm.model.visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
646
+ "vlm.model.visual.blocks.26.norm2.bias": "model-00001-of-00004.safetensors",
647
+ "vlm.model.visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
648
+ "vlm.model.visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
649
+ "vlm.model.visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
650
+ "vlm.model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
651
+ "vlm.model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
652
+ "vlm.model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
653
+ "vlm.model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
654
+ "vlm.model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
655
+ "vlm.model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
656
+ "vlm.model.visual.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
657
+ "vlm.model.visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
658
+ "vlm.model.visual.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
659
+ "vlm.model.visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
660
+ "vlm.model.visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
661
+ "vlm.model.visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
662
+ "vlm.model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
663
+ "vlm.model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
664
+ "vlm.model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
665
+ "vlm.model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
666
+ "vlm.model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
667
+ "vlm.model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
668
+ "vlm.model.visual.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
669
+ "vlm.model.visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
670
+ "vlm.model.visual.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
671
+ "vlm.model.visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
672
+ "vlm.model.visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
673
+ "vlm.model.visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
674
+ "vlm.model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
675
+ "vlm.model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
676
+ "vlm.model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
677
+ "vlm.model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
678
+ "vlm.model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
679
+ "vlm.model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
680
+ "vlm.model.visual.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
681
+ "vlm.model.visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
682
+ "vlm.model.visual.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
683
+ "vlm.model.visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
684
+ "vlm.model.visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
685
+ "vlm.model.visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
686
+ "vlm.model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
687
+ "vlm.model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
688
+ "vlm.model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
689
+ "vlm.model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
690
+ "vlm.model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
691
+ "vlm.model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
692
+ "vlm.model.visual.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
693
+ "vlm.model.visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
694
+ "vlm.model.visual.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
695
+ "vlm.model.visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
696
+ "vlm.model.visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
697
+ "vlm.model.visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
698
+ "vlm.model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
699
+ "vlm.model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
700
+ "vlm.model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
701
+ "vlm.model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
702
+ "vlm.model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
703
+ "vlm.model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
704
+ "vlm.model.visual.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
705
+ "vlm.model.visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
706
+ "vlm.model.visual.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
707
+ "vlm.model.visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
708
+ "vlm.model.visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
709
+ "vlm.model.visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
710
+ "vlm.model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
711
+ "vlm.model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
712
+ "vlm.model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
713
+ "vlm.model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
714
+ "vlm.model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
715
+ "vlm.model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
716
+ "vlm.model.visual.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
717
+ "vlm.model.visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
718
+ "vlm.model.visual.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
719
+ "vlm.model.visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
720
+ "vlm.model.visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
721
+ "vlm.model.visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
722
+ "vlm.model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
723
+ "vlm.model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
724
+ "vlm.model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
725
+ "vlm.model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
726
+ "vlm.model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
727
+ "vlm.model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
728
+ "vlm.model.visual.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
729
+ "vlm.model.visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
730
+ "vlm.model.visual.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
731
+ "vlm.model.visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
732
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00004.safetensors",
733
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00004.safetensors",
734
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00004.safetensors",
735
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00004.safetensors",
736
+ "vlm.model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00004.safetensors",
737
+ "vlm.model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00004.safetensors",
738
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00004.safetensors",
739
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00004.safetensors",
740
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00004.safetensors",
741
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00004.safetensors",
742
+ "vlm.model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00004.safetensors",
743
+ "vlm.model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00004.safetensors",
744
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00004.safetensors",
745
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00004.safetensors",
746
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00004.safetensors",
747
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00004.safetensors",
748
+ "vlm.model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00004.safetensors",
749
+ "vlm.model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00004.safetensors",
750
+ "vlm.model.visual.merger.linear_fc1.bias": "model-00001-of-00004.safetensors",
751
+ "vlm.model.visual.merger.linear_fc1.weight": "model-00001-of-00004.safetensors",
752
+ "vlm.model.visual.merger.linear_fc2.bias": "model-00001-of-00004.safetensors",
753
+ "vlm.model.visual.merger.linear_fc2.weight": "model-00001-of-00004.safetensors",
754
+ "vlm.model.visual.merger.norm.bias": "model-00001-of-00004.safetensors",
755
+ "vlm.model.visual.merger.norm.weight": "model-00001-of-00004.safetensors",
756
+ "vlm.model.visual.patch_embed.proj.bias": "model-00001-of-00004.safetensors",
757
+ "vlm.model.visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
758
+ "vlm.model.visual.pos_embed.weight": "model-00001-of-00004.safetensors"
759
+ }
760
+ }
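The sharded-checkpoint index ending above maps every tensor name to the shard file that stores it. A minimal sketch (not part of the upload) of how such an index is typically consumed, assuming the index and its shards have been downloaded into the working directory and follow the conventional `model.safetensors.index.json` naming:

import json

from safetensors import safe_open

# Hypothetical local path: assumes the index and shards sit in the working directory.
index_path = "model.safetensors.index.json"
with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]  # tensor name -> shard file, as listed above

name = "vlm.model.visual.pos_embed.weight"
shard = weight_map[name]                     # "model-00001-of-00004.safetensors"
with safe_open(shard, framework="pt") as shard_file:
    tensor = shard_file.get_tensor(name)     # read only this tensor from its shard
print(name, tuple(tensor.shape))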
modeling_colqwen3.py ADDED
@@ -0,0 +1,273 @@
1
+ """
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ Modeling for ColQwen3 retrieval, aligned with the ColQwen2 reference implementation.
17
+ """
18
+
19
+ from dataclasses import dataclass
20
+ from typing import Optional
21
+
22
+ from torch import nn
23
+ from transformers import AutoModelForImageTextToText
24
+ from transformers.configuration_utils import PretrainedConfig
25
+ from transformers.cache_utils import Cache
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
28
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
29
+
30
+ from .configuration_colqwen3 import ColQwen3Config
31
+
32
+
33
+ if is_torch_available():
34
+ import torch
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ @auto_docstring
40
+ class ColQwen3PreTrainedModel(PreTrainedModel):
41
+ config_class = ColQwen3Config
42
+ base_model_prefix = "model"
43
+ _no_split_modules = []
44
+ _supports_sdpa = True
45
+ _supports_flash_attn = True
46
+ _supports_flex_attn = True
47
+
48
+ def _init_weights(self, module):
49
+ std = (
50
+ self.config.initializer_range
51
+ if hasattr(self.config, "initializer_range")
52
+ else getattr(self.config.text_config, "initializer_range", 0.02)
53
+ )
54
+
55
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
56
+ module.weight.data.normal_(mean=0.0, std=std)
57
+ if module.bias is not None:
58
+ module.bias.data.zero_()
59
+ elif isinstance(module, nn.Embedding):
60
+ module.weight.data.normal_(mean=0.0, std=std)
61
+ if module.padding_idx is not None:
62
+ module.weight.data[module.padding_idx].zero_()
63
+
64
+
65
+ @dataclass
66
+ @auto_docstring(
67
+ custom_intro="""
68
+ Base class for ColQwen3 embeddings output.
69
+ """
70
+ )
71
+ class ColQwen3ForRetrievalOutput(ModelOutput):
72
+ r"""
73
+ embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, embed_dim)`):
74
+ The embeddings of the model.
75
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
76
+ It is a [`~cache_utils.Cache`] instance.
77
+ """
78
+
79
+ loss: Optional[torch.FloatTensor] = None
80
+ embeddings: Optional[torch.Tensor] = None
81
+ past_key_values: Optional[Cache] = None
82
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
83
+ attentions: Optional[tuple[torch.FloatTensor]] = None
84
+
85
+
86
+ @auto_docstring(
87
+ custom_intro="""
88
+ ColQwen3 retrieval model that mirrors the ColQwen2 late-interaction pipeline while using a Qwen3-VL backbone.
89
+ """
90
+ )
91
+ class ColQwen3(ColQwen3PreTrainedModel):
92
+ _checkpoint_conversion_mapping = {
93
+ # Legacy checkpoints saved from a bare Qwen3VLModel (no `vlm.` nesting).
94
+ r"^model\.visual": "vlm.model.visual",
95
+ r"^model\.language_model": "vlm.model.language_model",
96
+ r"^model\.": "vlm.model.",
97
+ r"^visual": "vlm.model.visual",
98
+ r"^language_model": "vlm.model.language_model",
99
+ r"^custom_text_proj": "embedding_proj_layer",
100
+ }
101
+ config_class = ColQwen3Config
102
+ model_type = ColQwen3Config.model_type
103
+
104
+ def __init__(
105
+ self,
106
+ config: ColQwen3Config,
107
+ attn_impl: Optional[str] = None,
108
+ mask_non_image_embeddings: bool = False,
109
+ ):
110
+ """
111
+ Args:
112
+ config (ColQwen3Config): Configuration carrying nested vision/text configs for the retrieval model.
113
+ attn_impl (Optional[str], optional): Attention implementation forwarded to the VLM (e.g., "flash_attention_2"). Defaults to None.
114
+ mask_non_image_embeddings (bool, optional): If True, zero out non-image embeddings after projection. Defaults to False.
115
+ """
116
+ super().__init__(config)
117
+ self.config = config
118
+
119
+ vision_cfg = (
120
+ config.vision_config.to_dict() if isinstance(config.vision_config, PretrainedConfig) else config.vision_config
121
+ )
122
+ text_cfg = config.text_config.to_dict() if isinstance(config.text_config, PretrainedConfig) else config.text_config
123
+
124
+ vlm_config = Qwen3VLConfig(
125
+ text_config=text_cfg,
126
+ vision_config=vision_cfg,
127
+ image_token_id=getattr(config, "image_token_id", 151655),
128
+ video_token_id=getattr(config, "video_token_id", 151656),
129
+ vision_start_token_id=getattr(config, "vision_start_token_id", 151652),
130
+ vision_end_token_id=getattr(config, "vision_end_token_id", 151653),
131
+ tie_word_embeddings=getattr(config.text_config, "tie_word_embeddings", False),
132
+ )
133
+ self.vlm = AutoModelForImageTextToText.from_config(vlm_config)
134
+
135
+ self.embedding_dim = self.config.embed_dim
136
+ self.embedding_proj_layer = nn.Linear(
137
+ self.vlm.config.text_config.hidden_size,
138
+ self.embedding_dim,
139
+ )
140
+ self.padding_side = getattr(config, "padding_side", "left")
141
+ self.mask_non_image_embeddings = mask_non_image_embeddings
142
+ self._tied_weights_keys = [f"vlm.{k}" for k in (self.vlm._tied_weights_keys or [])]
143
+
144
+ self.post_init()
145
+
146
+ if attn_impl is not None and hasattr(self.vlm, "set_attn_implementation"):
147
+ self.vlm.set_attn_implementation(attn_impl)
148
+
149
+ @classmethod
150
+ def from_pretrained(cls, *args, config: Optional[ColQwen3Config] = None, **kwargs):
151
+ key_mapping = kwargs.pop("key_mapping", None)
152
+ if key_mapping is None:
153
+ key_mapping = getattr(cls, "_checkpoint_conversion_mapping", None)
154
+
155
+ return super().from_pretrained(*args, config=config, **kwargs, key_mapping=key_mapping)
156
+
157
+ @can_return_tuple
158
+ @auto_docstring
159
+ def forward(
160
+ self,
161
+ input_ids: Optional[torch.LongTensor] = None,
162
+ attention_mask: Optional[torch.Tensor] = None,
163
+ position_ids: Optional[torch.LongTensor] = None,
164
+ past_key_values: Optional[Cache] = None,
165
+ inputs_embeds: Optional[torch.FloatTensor] = None,
166
+ labels: Optional[torch.LongTensor] = None,
167
+ use_cache: Optional[bool] = None,
168
+ output_attentions: Optional[bool] = None,
169
+ output_hidden_states: Optional[bool] = None,
170
+ return_dict: Optional[bool] = None,
171
+ pixel_values: Optional[torch.Tensor] = None,
172
+ image_grid_thw: Optional[torch.LongTensor] = None,
173
+ cache_position: Optional[torch.LongTensor] = None,
174
+ pixel_values_videos: Optional[torch.Tensor] = None,
175
+ video_grid_thw: Optional[torch.LongTensor] = None,
176
+ ) -> ColQwen3ForRetrievalOutput:
177
+ r"""
178
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
179
+ The temporal, height, and width of the feature grid for each image passed to the language model.
180
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
181
+ The temporal, height, and width of the feature grid for each video passed to the language model.
182
+ """
183
+ if pixel_values is not None and image_grid_thw is not None:
184
+ offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2]
185
+ pixel_values = torch.cat(
186
+ [pixel_sequence[:offset] for pixel_sequence, offset in zip(pixel_values, offsets)],
187
+ dim=0,
188
+ )
189
+ if pixel_values_videos is not None and video_grid_thw is not None:
190
+ video_offsets = video_grid_thw[:, 0] * video_grid_thw[:, 1] * video_grid_thw[:, 2]
191
+ pixel_values_videos = torch.cat(
192
+ [pixel_sequence[:offset] for pixel_sequence, offset in zip(pixel_values_videos, video_offsets)],
193
+ dim=0,
194
+ )
195
+
196
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
197
+ output_hidden_states = (
198
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
199
+ )
200
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
201
+
202
+ vlm_output = self.vlm.model(
203
+ input_ids=input_ids,
204
+ position_ids=position_ids,
205
+ attention_mask=attention_mask,
206
+ past_key_values=past_key_values,
207
+ inputs_embeds=inputs_embeds,
208
+ pixel_values_videos=pixel_values_videos,
209
+ use_cache=use_cache,
210
+ output_attentions=output_attentions,
211
+ output_hidden_states=output_hidden_states,
212
+ return_dict=return_dict,
213
+ pixel_values=pixel_values,
214
+ image_grid_thw=image_grid_thw,
215
+ video_grid_thw=video_grid_thw,
216
+ cache_position=cache_position,
217
+ )
218
+
219
+ vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
220
+
221
+ last_hidden_states = vlm_output[0]
222
+ proj_dtype = self.embedding_proj_layer.weight.dtype
223
+ embeddings = self.embedding_proj_layer(last_hidden_states.to(proj_dtype))
224
+
225
+ denom = embeddings.norm(dim=-1, keepdim=True).clamp_min(torch.finfo(embeddings.dtype).eps)
226
+ embeddings = embeddings / denom
227
+ if attention_mask is not None:
228
+ embeddings = embeddings * attention_mask.unsqueeze(-1)
229
+
230
+ if pixel_values is not None and self.mask_non_image_embeddings:
231
+ image_mask = (input_ids == self.vlm.config.image_token_id).unsqueeze(-1)
232
+ embeddings = embeddings * image_mask
233
+
234
+ return ColQwen3ForRetrievalOutput(
235
+ embeddings=embeddings,
236
+ past_key_values=vlm_output.past_key_values,
237
+ hidden_states=vlm_hidden_states,
238
+ attentions=vlm_output.attentions,
239
+ )
240
+
241
+ def get_input_embeddings(self):
242
+ return self.vlm.get_input_embeddings()
243
+
244
+ def set_input_embeddings(self, value):
245
+ self.vlm.set_input_embeddings(value)
246
+
247
+ def get_output_embeddings(self):
248
+ return self.vlm.get_output_embeddings()
249
+
250
+ def set_output_embeddings(self, new_embeddings):
251
+ self.vlm.set_output_embeddings(new_embeddings)
252
+
253
+ def tie_weights(self):
254
+ return self.vlm.tie_weights()
255
+
256
+ def resize_token_embeddings(
257
+ self,
258
+ new_num_tokens: Optional[int] = None,
259
+ pad_to_multiple_of: Optional[int] = None,
260
+ mean_resizing: bool = True,
261
+ ) -> nn.Embedding:
262
+ model_embeds = self.vlm.resize_token_embeddings(
263
+ new_num_tokens=new_num_tokens,
264
+ pad_to_multiple_of=pad_to_multiple_of,
265
+ mean_resizing=mean_resizing,
266
+ )
267
+
268
+ self.vlm.config.text_config.vocab_size = model_embeds.num_embeddings
269
+ self.vlm.config.vocab_size = model_embeds.num_embeddings
270
+ return model_embeds
271
+
272
+
273
+ __all__ = ["ColQwen3", "ColQwen3PreTrainedModel", "ColQwen3ForRetrievalOutput"]
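ColQwen3.forward above returns per-token embeddings only; ranking happens outside the model. A minimal sketch of the late-interaction (MaxSim) scoring these embeddings are intended for, assuming the L2-normalized, zero-padded outputs produced above; the shapes and the 128-dim size below are illustrative, not read from this checkpoint:

import torch


def maxsim_scores(query_embeddings: torch.Tensor, doc_embeddings: torch.Tensor) -> torch.Tensor:
    # query_embeddings: (num_queries, query_len, dim); doc_embeddings: (num_docs, doc_len, dim).
    # Both are assumed L2-normalized and zero-padded, as ColQwen3.forward returns them.
    sim = torch.einsum("qnd,kmd->qknm", query_embeddings, doc_embeddings)
    # Best-matching document token for every query token, summed over query tokens.
    return sim.amax(dim=-1).sum(dim=-1)  # (num_queries, num_docs)


# Toy tensors only; real embeddings come from ColQwen3(**batch).embeddings.
print(maxsim_scores(torch.randn(2, 16, 128), torch.randn(3, 700, 128)).shape)  # (2, 3)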
preprocessor_config.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_colqwen3.ColQwen3Processor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "disable_grouping": null,
10
+ "do_center_crop": null,
11
+ "do_convert_rgb": true,
12
+ "do_normalize": true,
13
+ "do_pad": null,
14
+ "do_rescale": true,
15
+ "do_resize": true,
16
+ "image_mean": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "image_processor_type": "Qwen2VLImageProcessorFast",
22
+ "image_std": [
23
+ 0.5,
24
+ 0.5,
25
+ 0.5
26
+ ],
27
+ "input_data_format": null,
28
+ "max_pixels": 1310720,
29
+ "merge_size": 2,
30
+ "min_pixels": null,
31
+ "pad_size": null,
32
+ "patch_size": 16,
33
+ "processor_class": "ColQwen3Processor",
34
+ "resample": 3,
35
+ "rescale_factor": 0.00392156862745098,
36
+ "return_tensors": null,
37
+ "size": {
38
+ "longest_edge": 1310720,
39
+ "shortest_edge": 65536
40
+ },
41
+ "temporal_patch_size": 2
42
+ }
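The size limits above amount to a visual-token budget: one merged token covers a patch_size * merge_size = 32-pixel square, so max_pixels = 1310720 caps an image at 1280 tokens, matching the `max_num_visual_tokens` default in processing_colqwen3.py below. A small illustrative sketch of that arithmetic with the `smart_resize` helper the Qwen2-VL image processors use; the 2000 x 1400 page size is made up:

from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

patch_size, merge_size = 16, 2           # values from the config above
tile = patch_size * merge_size           # pixels per merged visual token side (32)
max_pixels = 1_310_720                   # == 1280 * tile * tile

# Resize a made-up 2000 x 1400 page into the budget, then count merged tokens.
height, width = smart_resize(2000, 1400, factor=tile, min_pixels=65_536, max_pixels=max_pixels)
print(height, width, (height // tile) * (width // tile))  # token count <= 1280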
processing_colqwen3.py ADDED
@@ -0,0 +1,815 @@
1
+ """
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ Processing utilities for ColQwen3, aligned with the ColQwen2 reference implementation.
17
+ """
18
+
19
+ import importlib
20
+ import numpy as np
21
+ from typing import Any, ClassVar, List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ from PIL import Image
25
+ from transformers import BatchEncoding
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import AudioInput, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideoInput
29
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
30
+ from transformers.utils import logging
31
+
32
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+ try:
37
+ from fast_plaid import search
38
+ except ImportError:
39
+ logger.info(
40
+ "FastPlaid is not installed.If you want to use it:Instal with `pip install --no-deps fast-plaid fastkmeans`"
41
+ )
42
+
43
+
44
+ def get_torch_device(device: str = "auto") -> str:
45
+ """Resolve a torch device string with a simple auto mode."""
46
+ if device == "auto":
47
+ if torch.cuda.is_available():
48
+ device = "cuda:0"
49
+ elif torch.backends.mps.is_available(): # for Apple Silicon
50
+ device = "mps"
51
+ else:
52
+ device = "cpu"
53
+ return device
54
+
55
+
56
+ class ColQwen3ProcessorKwargs(ProcessingKwargs, total=False):
57
+ _defaults = {
58
+ "text_kwargs": {
59
+ "padding": "longest",
60
+ },
61
+ "images_kwargs": {
62
+ "data_format": "channels_first",
63
+ "do_convert_rgb": True,
64
+ },
65
+ "videos_kwargs": {
66
+ "return_metadata": True,
67
+ "data_format": "channels_first",
68
+ "do_convert_rgb": True,
69
+ },
70
+ "common_kwargs": {"return_tensors": "pt"},
71
+ }
72
+
73
+
74
+ class ColQwen3Processor(ProcessorMixin):
75
+ """
76
+ Constructs a ColQwen3 processor which wraps a Qwen3VLProcessor with retrieval-specific helpers.
77
+ """
78
+
79
+ attributes = ["image_processor", "tokenizer", "video_processor"]
80
+ image_processor_class = "AutoImageProcessor"
81
+ video_processor_class = "AutoVideoProcessor"
82
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
83
+
84
+ def __init__(
85
+ self,
86
+ image_processor=None,
87
+ tokenizer=None,
88
+ video_processor=None,
89
+ chat_template=None,
90
+ visual_prompt_prefix: Optional[str] = None,
91
+ video_prompt_prefix: Optional[str] = None,
92
+ query_prefix: Optional[str] = None,
93
+ **kwargs,
94
+ ):
95
+ super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)
96
+ self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
97
+ self.image_token_id = (
98
+ tokenizer.image_token_id
99
+ if getattr(tokenizer, "image_token_id", None)
100
+ else tokenizer.convert_tokens_to_ids(self.image_token)
101
+ )
102
+ self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
103
+ self.video_token_id = (
104
+ tokenizer.video_token_id
105
+ if getattr(tokenizer, "video_token_id", None)
106
+ else tokenizer.convert_tokens_to_ids(self.video_token)
107
+ )
108
+ self.vision_start_token = (
109
+ "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
110
+ )
111
+ self.vision_end_token = (
112
+ "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
113
+ )
114
+ self.vision_start_token_id = (
115
+ tokenizer.vision_start_token_id
116
+ if getattr(tokenizer, "vision_start_token_id", None)
117
+ else tokenizer.convert_tokens_to_ids(self.vision_start_token)
118
+ )
119
+ self.vision_end_token_id = (
120
+ tokenizer.vision_end_token_id
121
+ if getattr(tokenizer, "vision_end_token_id", None)
122
+ else tokenizer.convert_tokens_to_ids(self.vision_end_token)
123
+ )
124
+
125
+ if visual_prompt_prefix is None:
126
+ visual_prompt_prefix = (
127
+ "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"
128
+ )
129
+ self.visual_prompt_prefix = visual_prompt_prefix
130
+
131
+ if video_prompt_prefix is None:
132
+ video_prompt_prefix = (
133
+ "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>Describe the video.<|im_end|><|endoftext|>"
134
+ )
135
+ self.video_prompt_prefix = video_prompt_prefix
136
+
137
+ if query_prefix is None:
138
+ query_prefix = ""
139
+ self.query_prefix = query_prefix
140
+ self.tokenizer.padding_side = "left"
141
+
142
+ @classmethod
143
+ def from_pretrained( # type: ignore[override]
144
+ cls,
145
+ *args: Any,
146
+ max_num_visual_tokens: int = 1280,
147
+ **kwargs: Any,
148
+ ) -> "ColQwen3Processor":
149
+ instance = super().from_pretrained(
150
+ *args,
151
+ **kwargs,
152
+ )
153
+
154
+ patch_size = getattr(instance.image_processor, "patch_size", None)
155
+ merge_size = getattr(instance.image_processor, "merge_size", None) or getattr(
156
+ instance.image_processor, "spatial_merge_size", None
157
+ )
158
+ if patch_size is None or merge_size is None:
159
+ raise ValueError("Qwen3VL image processor is missing `patch_size` or `merge_size`/`spatial_merge_size`.")
160
+ tile = patch_size * merge_size
161
+ instance.image_processor.max_pixels = max_num_visual_tokens * tile * tile
162
+ instance.image_processor.size["longest_edge"] = instance.image_processor.max_pixels
163
+
164
+ return instance
165
+
166
+ def __call__(
167
+ self,
168
+ images: Optional[ImageInput] = None,
169
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
170
+ audio: Optional[AudioInput] = None,
171
+ videos: Optional[VideoInput] = None,
172
+ **kwargs: Unpack[ColQwen3ProcessorKwargs],
173
+ ) -> BatchFeature:
174
+ output_kwargs = self._merge_kwargs(
175
+ ColQwen3ProcessorKwargs,
176
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
177
+ **kwargs,
178
+ )
179
+ suffix = output_kwargs["text_kwargs"].pop("suffix", None)
180
+
181
+ return_token_type_ids = suffix is not None
182
+
183
+ if text is None and images is None and videos is None:
184
+ raise ValueError("Either text, images or videos must be provided")
185
+ if images is not None and videos is not None:
186
+ raise ValueError("Provide only one of `images` or `videos`, not both.")
187
+
188
+ image_list: Optional[list[Any]] = None
189
+ video_list: Optional[list[Any]] = None
190
+ text_list: Optional[list[str]] = None
191
+
192
+ if images is not None:
193
+ if is_valid_image(images):
194
+ image_list = [images]
195
+ elif isinstance(images, list):
196
+ image_list = images
197
+ else:
198
+ raise ValueError("images must be an image, list of images or list of list of images")
199
+
200
+ if videos is not None:
201
+ if isinstance(videos, list) or isinstance(videos, tuple):
202
+ video_list = list(videos)
203
+ else:
204
+ video_list = [videos]
205
+
206
+ if text is not None:
207
+ if isinstance(text, str):
208
+ text_list = [text]
209
+ elif isinstance(text, list):
210
+ if len(text) == 0 or not all(isinstance(t, (str, type(None))) for t in text):
211
+ raise ValueError("Text must be a string or a list of strings.")
212
+ text_list = text
213
+ else:
214
+ raise ValueError("Text must be a string or a list of strings")
215
+
216
+ if image_list is None and video_list is None and text_list is None:
217
+ raise ValueError("Either text, images or videos must be provided")
218
+
219
+ # Text-only batch
220
+ if image_list is None and video_list is None:
221
+ return self._encode_texts(text_list or [], suffix, output_kwargs)
222
+
223
+ # Image-only batch
224
+ if image_list is not None and video_list is None and text_list is None:
225
+ return self._encode_images(image_list, None, output_kwargs, return_token_type_ids)
226
+
227
+ # Video-only batch
228
+ if video_list is not None and image_list is None and text_list is None:
229
+ return self._encode_videos(video_list, None, output_kwargs, return_token_type_ids)
230
+
231
+ # Mixed image+text batch
232
+ if image_list is not None:
233
+ if len(image_list) != len(text_list):
234
+ raise ValueError("When providing both images and text, their lengths must match.")
235
+
236
+ paired_images: list[Any] = []
237
+ paired_texts: list[str] = []
238
+ for img, txt in zip(image_list, text_list):
239
+ if img is None or not is_valid_image(img):
240
+ raise ValueError("When providing both images and text, each item must include a valid image.")
241
+ paired_images.append(img)
242
+ paired_texts.append(txt or "")
243
+
244
+ encoded_pairs = self._encode_image_text_pairs(
245
+ paired_images, paired_texts, output_kwargs, return_token_type_ids
246
+ )
247
+ return encoded_pairs
248
+
249
+ # Mixed video+text batch
250
+ if video_list is not None:
251
+ if len(video_list) != len(text_list):
252
+ raise ValueError("When providing both videos and text, their lengths must match.")
253
+
254
+ paired_videos: list[Any] = []
255
+ paired_texts: list[str] = []
256
+ for vid, txt in zip(video_list, text_list):
257
+ if vid is None:
258
+ raise ValueError("When providing both videos and text, each item must include a valid video.")
259
+ paired_videos.append(vid)
260
+ paired_texts.append(txt or "")
261
+
262
+ return self._encode_video_text_pairs(paired_videos, paired_texts, output_kwargs, return_token_type_ids)
263
+
264
+ raise ValueError("Unsupported input combination.")
265
+
266
+ def process_images(
267
+ self,
268
+ images: List[Image.Image],
269
+ ) -> Union[BatchFeature, BatchEncoding]:
270
+ images = [image.convert("RGB") for image in images]
271
+ return self(images=images, padding="longest", return_tensors="pt")
272
+
273
+ def process_texts(self, texts: List[str]) -> Union[BatchFeature, BatchEncoding]:
274
+ return self(text=texts, return_tensors="pt", padding="longest")
275
+
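+ # Typical retrieval usage (illustrative): documents (page images) go through
+ # `process_images(...)` and queries through `process_texts(...)`; both return
+ # batches that can be passed directly to `ColQwen3(**batch)`.
+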
276
+ def _encode_images(
277
+ self,
278
+ images: List[Any],
279
+ texts: Optional[List[str]],
280
+ output_kwargs: dict[str, Any],
281
+ return_token_type_ids: bool,
282
+ ) -> BatchFeature:
283
+ if not images:
284
+ raise ValueError("No images provided for encoding.")
285
+
286
+ if not (isinstance(images, list) and (is_valid_image(images[0]) or is_valid_image(images[0][0]))):
287
+ raise ValueError("images must be an image, list of images or list of list of images")
288
+
289
+ if texts is None:
290
+ texts = ["" for _ in range(len(images))]
291
+ elif len(texts) not in (1, len(images)):
292
+ raise ValueError("Length of `text` must either be 1 or match the number of images.")
293
+ elif len(texts) == 1 and len(images) > 1:
294
+ texts = texts * len(images)
295
+
296
+ # Convert images to RGB
297
+ if isinstance(images[0], list):
298
+ # List of list of images
299
+ images = [[img.convert("RGB") for img in img_list] for img_list in images]
300
+ else:
301
+ # List of images
302
+ images = [img.convert("RGB") for img in images]
303
+
304
+ texts_doc = []
305
+ for extra_text in texts:
306
+ extra_text = (extra_text or "").strip()
307
+ if extra_text:
308
+ prompt = self.visual_prompt_prefix.replace("Describe the image.", f"Describe the image. {extra_text}")
309
+ else:
310
+ prompt = self.visual_prompt_prefix
311
+ texts_doc.append(prompt)
312
+
313
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
314
+ image_grid_thw = image_inputs["image_grid_thw"]
315
+
316
+ if image_grid_thw is not None:
317
+ merge_size = getattr(self.image_processor, "merge_size", None) or getattr(
318
+ self.image_processor, "spatial_merge_size", None
319
+ )
320
+ if merge_size is None:
321
+ raise ValueError("Qwen3VL image processor is missing `merge_size`/`spatial_merge_size`.")
322
+ merge_length = merge_size**2
323
+ index = 0
324
+ for i in range(len(texts_doc)):
325
+ while self.image_token in texts_doc[i]:
326
+ texts_doc[i] = texts_doc[i].replace(
327
+ self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
328
+ )
329
+ index += 1
330
+ texts_doc[i] = texts_doc[i].replace("<|placeholder|>", self.image_token)
331
+
332
+ text_inputs = self.tokenizer(
333
+ texts_doc,
334
+ return_token_type_ids=False,
335
+ **output_kwargs["text_kwargs"],
336
+ )
337
+
338
+ return_data = BatchFeature(data={**text_inputs, **image_inputs})
339
+
340
+ offsets = return_data["image_grid_thw"][:, 1] * return_data["image_grid_thw"][:, 2]
341
+
342
+ pixel_values = list(
343
+ torch.split(return_data["pixel_values"], offsets.tolist())
344
+ ) # [(num_patches_image_0, pixel_values), ..., (num_patches_image_n, pixel_values)]
345
+
346
+ return_data["pixel_values"] = torch.nn.utils.rnn.pad_sequence(
347
+ pixel_values, batch_first=True
348
+ ) # (batch_size, max_num_patches, pixel_values)
349
+
350
+ if return_token_type_ids:
351
+ labels = return_data["input_ids"].masked_fill(return_data["token_type_ids"] == 0, -100)
352
+ return_data.update({"labels": labels})
353
+
354
+ return return_data
355
+
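# Editor's note: an illustrative sketch (not part of the uploaded file) of the image-token
# expansion performed above: each image placeholder is expanded to
# grid_thw.prod() // merge_size**2 tokens. For example, with merge_size=2 and a single
# image whose grid_thw is (t=1, h=64, w=48):
t, h, w, merge_size = 1, 64, 48, 2
num_image_tokens = (t * h * w) // merge_size**2  # 3072 // 4 = 768 image tokens in the prompt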
356
+ def _encode_videos(
357
+ self,
358
+ videos: List[Any],
359
+ texts: Optional[List[str]],
360
+ output_kwargs: dict[str, Any],
361
+ return_token_type_ids: bool,
362
+ ) -> BatchFeature:
363
+ if not videos:
364
+ raise ValueError("No videos provided for encoding.")
365
+
366
+ if texts is None:
367
+ texts = ["" for _ in range(len(videos))]
368
+ elif len(texts) not in (1, len(videos)):
369
+ raise ValueError("Length of `text` must either be 1 or match the number of videos.")
370
+ elif len(texts) == 1 and len(videos) > 1:
371
+ texts = texts * len(videos)
372
+
373
+ texts_doc = []
374
+ for extra_text in texts:
375
+ extra_text = (extra_text or "").strip()
376
+ if extra_text:
377
+ prompt = self.video_prompt_prefix.replace("Describe the video.", f"Describe the video. {extra_text}")
378
+ else:
379
+ prompt = self.video_prompt_prefix
380
+ texts_doc.append(prompt)
381
+
382
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
383
+ video_grid_thw = videos_inputs["video_grid_thw"]
384
+
385
+ if "video_metadata" not in videos_inputs:
386
+ raise ValueError(
387
+ "Video metadata is required to build video prompts. Please set `return_metadata=True` "
388
+ "when calling the processor."
389
+ )
390
+ if "return_metadata" not in output_kwargs["videos_kwargs"]:
391
+ video_metadata = videos_inputs.pop("video_metadata")
392
+ else:
393
+ video_metadata = videos_inputs["video_metadata"]
394
+
395
+ merge_size = getattr(self.video_processor, "merge_size", None)
396
+ if merge_size is None:
397
+ raise ValueError("Qwen3VL video processor is missing `merge_size`.")
398
+ merge_length = merge_size**2
399
+
400
+ index = 0
401
+ for i in range(len(texts_doc)):
402
+ while self.video_token in texts_doc[i]:
403
+ metadata = video_metadata[index]
404
+ if metadata.fps is None:
405
+ logger.warning_once(
406
+ "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could "
407
+ "not be inferred. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
408
+ )
409
+ metadata.fps = 24 if metadata.fps is None else metadata.fps
410
+
411
+ curr_timestamp = self._calculate_timestamps(
412
+ metadata.frames_indices, metadata.fps, self.video_processor.merge_size
413
+ )
414
+ frame_seqlen = int(video_grid_thw[index][1:].prod().item() // merge_length)
415
+ video_placeholder = ""
416
+ for frame_idx in range(int(video_grid_thw[index][0])):
417
+ curr_time = curr_timestamp[frame_idx]
418
+ video_placeholder += f"<{curr_time:.1f} seconds>"
419
+ video_placeholder += (
420
+ self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
421
+ )
422
+
423
+ if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in texts_doc[i]:
424
+ texts_doc[i] = texts_doc[i].replace(
425
+ f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
426
+ )
427
+ else:
428
+ texts_doc[i] = texts_doc[i].replace(self.video_token, video_placeholder, 1)
429
+ index += 1
430
+ texts_doc[i] = texts_doc[i].replace("<|placeholder|>", self.video_token)
431
+
432
+ text_inputs = self.tokenizer(
433
+ texts_doc,
434
+ return_token_type_ids=False,
435
+ **output_kwargs["text_kwargs"],
436
+ )
437
+
438
+ return_data = BatchFeature(data={**text_inputs, **videos_inputs})
439
+
440
+ offsets = video_grid_thw[:, 0] * video_grid_thw[:, 1] * video_grid_thw[:, 2]
441
+ pixel_values_videos = [
442
+ return_data["pixel_values_videos"][idx, : offset.item()] for idx, offset in enumerate(offsets)
443
+ ]
444
+ return_data["pixel_values_videos"] = torch.nn.utils.rnn.pad_sequence(
445
+ pixel_values_videos, batch_first=True
446
+ )
447
+
448
+ if return_token_type_ids:
449
+ labels = return_data["input_ids"].masked_fill(return_data["token_type_ids"] == 0, -100)
450
+ return_data.update({"labels": labels})
451
+
452
+ return return_data
453
+
454
+ def _encode_texts(
455
+ self,
456
+ texts: List[str],
457
+ suffix: Optional[str],
458
+ output_kwargs: dict[str, Any],
459
+ ) -> BatchFeature:
460
+ if not texts:
461
+ raise ValueError("No texts provided for encoding.")
462
+
463
+ if suffix is None:
464
+ suffix = self.query_augmentation_token * 10
465
+
466
+ texts_query: list[str] = []
467
+
468
+ for query in texts:
469
+ augmented_query = self.query_prefix + query + suffix
470
+ texts_query.append(augmented_query)
471
+
472
+ batch_query = self.tokenizer(
473
+ texts_query,
474
+ return_token_type_ids=False,
475
+ **output_kwargs["text_kwargs"],
476
+ )
477
+
478
+ return batch_query
479
+
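# Editor's note: a minimal sketch (not part of the uploaded file) of the query augmentation
# done by `_encode_texts`. With this repo's processor_config.json, `query_prefix` is "" and
# `query_augmentation_token` is the tokenizer pad token "<|endoftext|>", so by default every
# query is suffixed with 10 augmentation tokens before tokenization.
query_prefix = ""                    # from processor_config.json
pad_token = "<|endoftext|>"          # tokenizer pad token, used as the augmentation token
suffix = pad_token * 10              # default suffix when none is passed
augmented_query = query_prefix + "What is shown in the figure?" + suffix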
480
+ def _encode_image_text_pairs(
481
+ self,
482
+ images: List[Any],
483
+ texts: List[str],
484
+ output_kwargs: dict[str, Any],
485
+ return_token_type_ids: bool,
486
+ ) -> BatchFeature:
487
+ if len(images) != len(texts):
488
+ raise ValueError("`images` and `text` must have the same length for paired encoding.")
489
+ if any(img is None for img in images):
490
+ raise ValueError("All paired items must include an image.")
491
+
492
+ if any(isinstance(img, list) for img in images):
493
+ raise ValueError("Only one image is allowed per paired item.")
494
+
495
+ return self._encode_images(images, texts, output_kwargs, return_token_type_ids)
496
+
497
+ def _encode_video_text_pairs(
498
+ self,
499
+ videos: List[Any],
500
+ texts: List[str],
501
+ output_kwargs: dict[str, Any],
502
+ return_token_type_ids: bool,
503
+ ) -> BatchFeature:
504
+ if len(videos) != len(texts):
505
+ raise ValueError("`videos` and `text` must have the same length for paired encoding.")
506
+ if any(vid is None for vid in videos):
507
+ raise ValueError("All paired items must include a video.")
508
+
509
+ if any(isinstance(vid, list) for vid in videos):
510
+ raise ValueError("Only one video is allowed per paired item.")
511
+
512
+ return self._encode_videos(videos, texts, output_kwargs, return_token_type_ids)
513
+
514
+ @staticmethod
515
+ def _split_batch_feature(batch_feature: BatchFeature) -> list[BatchFeature]:
516
+ # Split a batched BatchFeature into a list of per-item BatchFeatures.
517
+ length: Optional[int] = None
518
+ for value in batch_feature.values():
519
+ if hasattr(value, "__len__"):
520
+ try:
521
+ length = len(value)
522
+ except Exception:
523
+ continue
524
+ if length is not None:
525
+ break
526
+
527
+ if length is None:
528
+ return [batch_feature]
529
+
530
+ items: list[BatchFeature] = []
531
+ for idx in range(length):
532
+ data = {}
533
+ for key, value in batch_feature.items():
534
+ try:
535
+ data[key] = value[idx]
536
+ except Exception:
537
+ data[key] = value
538
+ items.append(BatchFeature(data=data))
539
+ return items
540
+
541
+ @staticmethod
542
+ def _merge_batch_features(features: list[BatchFeature]) -> BatchFeature:
543
+ if not features:
544
+ return BatchFeature()
545
+
546
+ all_keys = set()
547
+ for feat in features:
548
+ all_keys.update(feat.keys())
549
+
550
+ merged: dict[str, list[Any]] = {key: [] for key in all_keys}
551
+ for feat in features:
552
+ for key in all_keys:
553
+ merged[key].append(feat.get(key))
554
+
555
+ combined: dict[str, Any] = {}
556
+ for key, values in merged.items():
557
+ # Prefer stacking tensors so callers get batched tensors instead of lists
558
+ if all(isinstance(v, torch.Tensor) for v in values):
559
+ try:
560
+ combined[key] = torch.stack(values)
561
+ continue
562
+ except Exception:
563
+ # Fallback to list if shapes are incompatible for stacking
564
+ pass
565
+ combined[key] = values
566
+
567
+ return BatchFeature(data=combined)
568
+
569
+ def score_retrieval(
570
+ self,
571
+ qs: List[torch.Tensor],
572
+ ps: List[torch.Tensor],
573
+ score_batch_size: int = 128,
574
+ device: Optional[Union[str, torch.device]] = None,
575
+ **kwargs,
576
+ ) -> torch.Tensor:
577
+ return self.score_multi_vector(qs, ps, batch_size=score_batch_size, device=device, **kwargs)
578
+
579
+ @staticmethod
580
+ def score_single_vector(
581
+ qs: Union[torch.Tensor, List[torch.Tensor]],
582
+ ps: Union[torch.Tensor, List[torch.Tensor]],
583
+ device: Optional[Union[str, torch.device]] = None,
584
+ ) -> torch.Tensor:
585
+ """
586
+ Compute the dot product score for the given single-vector query and passage embeddings.
587
+ """
588
+ device = device or get_torch_device("auto")
589
+
590
+ if isinstance(qs, list) and isinstance(ps, list):
591
+ if len(qs) == 0:
592
+ raise ValueError("No queries provided")
593
+ if len(ps) == 0:
594
+ raise ValueError("No passages provided")
595
+
596
+ qs = torch.stack(qs).to(device)
597
+ ps = torch.stack(ps).to(device)
598
+ else:
599
+ qs = qs.to(device)
600
+ ps = ps.to(device)
601
+
602
+ scores = torch.einsum("bd,cd->bc", qs, ps)
603
+ assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
604
+
605
+ scores = scores.to(torch.float32)
606
+ return scores
607
+
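# Editor's note: an illustrative sketch (not part of the uploaded file) of what
# `score_single_vector` computes: a (n_queries, n_passages) matrix of dot products.
import torch

qs = [torch.randn(128) for _ in range(2)]  # 2 single-vector query embeddings, dim 128
ps = [torch.randn(128) for _ in range(5)]  # 5 single-vector passage embeddings, dim 128
scores = torch.einsum("bd,cd->bc", torch.stack(qs), torch.stack(ps))  # shape (2, 5)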
608
+ @staticmethod
609
+ def score_multi_vector(
610
+ qs: Union[torch.Tensor, List[torch.Tensor]],
611
+ ps: Union[torch.Tensor, List[torch.Tensor]],
612
+ batch_size: int = 128,
613
+ device: Optional[Union[str, torch.device]] = None,
614
+ ) -> torch.Tensor:
615
+ """
616
+ Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
617
+ query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
618
+ image of a document page.
619
+
620
+ Because the embedding tensors are multi-vector and can thus have different shapes, they
621
+ should be fed as:
622
+ (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
623
+ (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
624
+ obtained by padding the list of tensors.
625
+
626
+ Args:
627
+ qs (`Union[torch.Tensor, List[torch.Tensor]]`): Query embeddings.
628
+ ps (`Union[torch.Tensor, List[torch.Tensor]]`): Passage embeddings.
629
+ batch_size (`int`, *optional*): Batch size for computing scores.
630
+ device (`Union[str, torch.device]`, *optional*): Device to use for computation. If not
631
+ provided, uses `get_torch_device("auto")`.
632
+
633
+ Returns:
634
+ `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
635
+ tensor is returned on the CPU.
636
+ """
637
+ device = device or get_torch_device("auto")
638
+
639
+ if len(qs) == 0:
640
+ raise ValueError("No queries provided")
641
+ if len(ps) == 0:
642
+ raise ValueError("No passages provided")
643
+
644
+ scores_list: List[torch.Tensor] = []
645
+
646
+ for i in range(0, len(qs), batch_size):
647
+ scores_batch = []
648
+ qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
649
+ device
650
+ )
651
+ for j in range(0, len(ps), batch_size):
652
+ ps_batch = torch.nn.utils.rnn.pad_sequence(
653
+ ps[j : j + batch_size], batch_first=True, padding_value=0
654
+ ).to(device)
655
+ scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
656
+ scores_batch = torch.cat(scores_batch, dim=1).cpu()
657
+ scores_list.append(scores_batch)
658
+
659
+ scores = torch.cat(scores_list, dim=0)
660
+ assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
661
+
662
+ scores = scores.to(torch.float32)
663
+ return scores
664
+
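# Editor's note: an illustrative sketch (not part of the uploaded file) of the
# late-interaction/MaxSim score computed by `score_multi_vector` for a single
# query/passage pair: for each query token, take the maximum similarity over all
# passage tokens, then sum over query tokens.
import torch

q = torch.randn(7, 128)    # one query: 7 token embeddings
p = torch.randn(900, 128)  # one passage (document page): 900 patch embeddings
score = (q @ p.T).max(dim=1).values.sum()  # scalar MaxSim score for this pair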
665
+ @staticmethod
666
+ def get_topk_plaid(
667
+ qs: Union[torch.Tensor, List[torch.Tensor]],
668
+ plaid_index: "search.FastPlaid",
669
+ k: int = 10,
670
+ batch_size: int = 128,
671
+ device: Optional[Union[str, torch.device]] = None,
672
+ ) -> List[Any]:
673
+ """
674
+ Experimental: Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
675
+ query embeddings (`qs`) and passage embeddings encoded in a FastPlaid index. For ColPali, a passage is the
676
+ image of a document page.
677
+ """
678
+ device = device or get_torch_device("auto")
679
+
680
+ if len(qs) == 0:
681
+ raise ValueError("No queries provided")
682
+
683
+ scores_list: List[torch.Tensor] = []
684
+
685
+ for i in range(0, len(qs), batch_size):
686
+ scores_batch = []
687
+ qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
688
+ device
689
+ )
690
+ scores_batch = plaid_index.search(
691
+ queries_embeddings=qs_batch.to(torch.float32),
692
+ top_k=k,
693
+ )
694
+ scores_list.append(scores_batch)
695
+
696
+ return scores_list
697
+
698
+ @staticmethod
699
+ def create_plaid_index(
700
+ ps: Union[torch.Tensor, List[torch.Tensor]],
701
+ device: Optional[Union[str, torch.device]] = None,
702
+ ) -> "search.FastPlaid":
703
+ """
704
+ Experimental: Create a FastPlaid index from the given passage embeddings.
705
+ Args:
706
+ ps (`Union[torch.Tensor, List[torch.Tensor]]`): Passage embeddings. Should be a list of tensors,
707
+ where each tensor is of shape (sequence_length_i, embedding_dim).
708
+ device (`Optional[Union[str, torch.device]]`, *optional*): Device to use for computation. If not
709
+ provided, uses `get_torch_device("auto")`.
710
+ """
711
+ if not importlib.util.find_spec("fast_plaid"):
712
+ raise ImportError("FastPlaid is not installed. Please install it with `pip install fast-plaid`.")
713
+
714
+ fast_plaid_index = search.FastPlaid(index="index")
715
+ device = device or get_torch_device("auto")
716
+ fast_plaid_index.create(documents_embeddings=[d.to(device).to(torch.float32) for d in ps])
717
+ return fast_plaid_index
718
+
719
+ def get_n_patches(
720
+ self,
721
+ image_size: Tuple[int, int],
722
+ spatial_merge_size: int,
723
+ ) -> Tuple[int, int]:
724
+ """
725
+ Get the number of patches (n_patches_x, n_patches_y) that will be used to process an image of
726
+ size (height, width) with the given patch size.
727
+
728
+ The `spatial_merge_size` is the number of patches that will be merged spatially. It is stored
729
+ as a `Qwen2VLForConditionalGeneration` attribute under `model.spatial_merge_size`.
730
+ """
731
+ patch_size = self.image_processor.patch_size
732
+
733
+ height_new, width_new = smart_resize(
734
+ width=image_size[0],
735
+ height=image_size[1],
736
+ factor=patch_size * self.image_processor.merge_size,
737
+ min_pixels=self.image_processor.size["shortest_edge"],
738
+ max_pixels=self.image_processor.size["longest_edge"],
739
+ )
740
+
741
+ n_patches_x = width_new // patch_size // spatial_merge_size
742
+ n_patches_y = height_new // patch_size // spatial_merge_size
743
+
744
+ return n_patches_x, n_patches_y
745
+
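# Editor's note: a worked example (not part of the uploaded file), assuming patch_size=16
# and spatial_merge_size=2 (the values used by the video preprocessor config in this repo;
# the image processor values are an assumption here). If smart_resize yields a 768x1024
# (height x width) image:
patch_size, spatial_merge_size = 16, 2
height_new, width_new = 768, 1024  # hypothetical smart_resize output
n_patches_x = width_new // patch_size // spatial_merge_size   # 1024 // 16 // 2 = 32
n_patches_y = height_new // patch_size // spatial_merge_size  # 768 // 16 // 2 = 24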
746
+ def get_image_mask(self, batch_images: BatchFeature) -> torch.Tensor:
747
+ return batch_images.input_ids == self.image_token_id
748
+
749
+ def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
750
+ vision_data = {}
751
+ if image_sizes is not None:
752
+ images_kwargs = ColQwen3ProcessorKwargs._defaults.get("images_kwargs", {})
753
+ images_kwargs.update(kwargs)
754
+ merge_size = images_kwargs.get("merge_size", None) or getattr(
755
+ self.image_processor, "merge_size", None
756
+ ) or getattr(self.image_processor, "spatial_merge_size", None)
757
+ if merge_size is None:
758
+ raise ValueError("Qwen3VL image processor is missing `merge_size`/`spatial_merge_size`.")
759
+
760
+ num_image_patches = [
761
+ self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
762
+ for image_size in image_sizes
763
+ ]
764
+ num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
765
+ vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
766
+
767
+ video_sizes = kwargs.pop("video_sizes", None)
768
+ if video_sizes is not None:
769
+ videos_kwargs = ColQwen3ProcessorKwargs._defaults.get("videos_kwargs", {})
770
+ videos_kwargs.update(kwargs)
771
+ merge_size = videos_kwargs.get("merge_size", None) or getattr(self.video_processor, "merge_size", None)
772
+ if merge_size is None:
773
+ raise ValueError("Qwen3VL video processor is missing `merge_size`.")
774
+
775
+ num_video_patches = [
776
+ self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs) for video_size in video_sizes
777
+ ]
778
+ num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
779
+ vision_data.update({"num_video_tokens": num_video_tokens, "num_video_patches": num_video_patches})
780
+
781
+ return MultiModalData(**vision_data)
782
+
783
+ @property
784
+ def model_input_names(self) -> list[str]:
785
+ return [
786
+ "input_ids",
787
+ "attention_mask",
788
+ "pixel_values",
789
+ "image_grid_thw",
790
+ "pixel_values_videos",
791
+ "video_grid_thw",
792
+ ]
793
+
794
+ @property
795
+ def query_augmentation_token(self) -> str:
796
+ return self.tokenizer.pad_token
797
+
798
+ def get_video_mask(self, batch_videos: BatchFeature) -> torch.Tensor:
799
+ return batch_videos.input_ids == self.video_token_id
800
+
801
+ def _calculate_timestamps(
802
+ self, indices: Union[list[int], np.ndarray], video_fps: float, merge_size: int = 2
803
+ ) -> list[float]:
804
+ if not isinstance(indices, list):
805
+ indices = indices.tolist()
806
+ if len(indices) % merge_size != 0:
807
+ indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
808
+ timestamps = [idx / video_fps for idx in indices]
809
+ timestamps = [
810
+ (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
811
+ ]
812
+ return timestamps
813
+
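# Editor's note: a worked example (not part of the uploaded file) of `_calculate_timestamps`
# with merge_size=2: frame indices are converted to seconds, then each temporal merge group
# is collapsed to the average of its first and last timestamp.
indices, video_fps, merge_size = [0, 8, 16, 24], 8.0, 2
timestamps = [idx / video_fps for idx in indices]          # [0.0, 1.0, 2.0, 3.0]
merged = [(timestamps[i] + timestamps[i + merge_size - 1]) / 2
          for i in range(0, len(timestamps), merge_size)]  # [0.5, 2.5]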
814
+
815
+ __all__ = ["ColQwen3Processor", "ColQwen3ProcessorKwargs"]
processor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_colqwen3.ColQwen3Processor"
4
+ },
5
+ "processor_class": "ColQwen3Processor",
6
+ "query_prefix": "",
7
+ "video_prompt_prefix": "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>Describe the video.<|im_end|><|endoftext|>",
8
+ "visual_prompt_prefix": "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"
9
+ }
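# Editor's note: a minimal, hedged usage sketch (not part of the uploaded files). Since
# `auto_map` points AutoProcessor at processing_colqwen3.ColQwen3Processor, the processor
# should be loadable directly from this repository with remote code enabled.
# "<repo_id>" below is a placeholder for this repository's id.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<repo_id>", trust_remote_code=True)
batch_queries = processor.process_texts(["What is shown in the figure?"])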
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,243 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "auto_map": {
230
+ "AutoProcessor": "processing_colqwen3.ColQwen3Processor"
231
+ },
232
+ "bos_token": null,
233
+ "clean_up_tokenization_spaces": false,
234
+ "eos_token": "<|im_end|>",
235
+ "errors": "replace",
236
+ "extra_special_tokens": {},
237
+ "model_max_length": 262144,
238
+ "pad_token": "<|endoftext|>",
239
+ "processor_class": "ColQwen3Processor",
240
+ "split_special_tokens": false,
241
+ "tokenizer_class": "Qwen2Tokenizer",
242
+ "unk_token": null
243
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_colqwen3.ColQwen3Processor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "do_center_crop": null,
10
+ "do_convert_rgb": true,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "do_sample_frames": true,
15
+ "fps": 2,
16
+ "image_mean": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "image_processor_type": "Qwen2VLImageProcessorFast",
22
+ "image_std": [
23
+ 0.5,
24
+ 0.5,
25
+ 0.5
26
+ ],
27
+ "input_data_format": null,
28
+ "max_frames": 768,
29
+ "merge_size": 2,
30
+ "min_frames": 4,
31
+ "num_frames": null,
32
+ "pad_size": null,
33
+ "patch_size": 16,
34
+ "processor_class": "ColQwen3Processor",
35
+ "resample": 3,
36
+ "rescale_factor": 0.00392156862745098,
37
+ "return_metadata": false,
38
+ "size": {
39
+ "longest_edge": 16777216,
40
+ "shortest_edge": 65536
41
+ },
42
+ "temporal_patch_size": 2,
43
+ "video_metadata": null,
44
+ "video_processor_type": "Qwen3VLVideoProcessor"
45
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff