File size: 4,420 Bytes
5a4159a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Configuration for ColQwen3, adapted to mirror the ColQwen2 structure.
"""

from copy import deepcopy
from typing import Any

from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING
from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig, Qwen3VLVisionConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class ColQwen3Config(PretrainedConfig):
    """Configuration for ColQwen3 retrieval model."""

    model_type = "colqwen3"
    sub_configs: dict[str, Any] = {"vision_config": Qwen3VLVisionConfig, "text_config": Qwen3VLTextConfig}

    def __init__(
        self,
        vision_config: Any = None,
        text_config: Any = None,
        embed_dim: int = 320,
        padding_side: str = "left",
        initializer_range: float = 0.02,
        dtype: str | None = None,
        **kwargs,
    ):
        if vision_config is None or text_config is None:
            base_vlm_config = CONFIG_MAPPING["qwen3_vl"]()
            if vision_config is None:
                vision_config = deepcopy(base_vlm_config.vision_config)
                logger.info("`vision_config` is `None`. Initializing with the default `Qwen3VLVisionConfig`.")
            if text_config is None:
                text_config = deepcopy(base_vlm_config.text_config)
                logger.info("`text_config` is `None`. Initializing with the default `Qwen3VLTextConfig`.")

        if isinstance(vision_config, dict):
            vision_config = Qwen3VLVisionConfig(**deepcopy(vision_config))
        elif not isinstance(vision_config, PretrainedConfig):
            raise TypeError(
                f"Invalid type for `vision_config`. Expected `PretrainedConfig`, `dict`, or `None`, got {type(vision_config)}."
            )

        if isinstance(text_config, dict):
            text_config = Qwen3VLTextConfig(**deepcopy(text_config))
        elif not isinstance(text_config, PretrainedConfig):
            raise TypeError(
                f"Invalid type for `text_config`. Expected `PretrainedConfig`, `dict`, or `None`, got {type(text_config)}."
            )

        if embed_dim <= 0:
            raise ValueError(f"`embed_dim` must be positive, got {embed_dim}.")

        super().__init__(**kwargs)
        self.vision_config = vision_config
        self.text_config = text_config
        self.embed_dim = embed_dim
        self.padding_side = padding_side
        self.initializer_range = initializer_range
        # Preserve incoming dtype so downstream models avoid attribute errors
        self.dtype = dtype or getattr(self, "dtype", None)

    @classmethod
    def from_base_config(cls, base_config: PretrainedConfig) -> "ColQwen3Config":
        """Upgrade a base Qwen3VLConfig-like config into ColQwen3Config."""
        if isinstance(base_config, dict):
            data = dict(base_config)
        else:
            data = base_config.to_dict()

        vision_cfg = data.get("vision_config")
        if isinstance(vision_cfg, dict):
            data["vision_config"] = Qwen3VLVisionConfig.from_dict(vision_cfg)

        text_cfg = data.get("text_config")
        if isinstance(text_cfg, dict):
            data["text_config"] = Qwen3VLTextConfig.from_dict(text_cfg)

        data.setdefault("model_type", cls.model_type)
        if hasattr(base_config, "dtype"):
            data.setdefault("dtype", getattr(base_config, "dtype"))
        elif hasattr(base_config, "torch_dtype") and base_config.torch_dtype is not None:
            data.setdefault("dtype", str(base_config.torch_dtype))

        return cls.from_dict(data)

    def get_text_config(self, *args, **kwargs) -> PretrainedConfig:
        return self.text_config


DEFAULT_CONFIG = ColQwen3Config()

__all__ = ["ColQwen3Config", "DEFAULT_CONFIG"]