support vllm (#6)

- support vllm (24b80d31a032dfc653c12c049c1370ff992caae3)
- Update config.json (d1daf81e9b327a9042c3f7c518f4a83f266adbd9)
- Rename configuration_ernie_45t_vl.py to configuration_ernie4_5_vl.py (3990673651b8a78fbe0d4a31f2f81d522cebed48)
- Rename modeling_ernie_45t_vl.py to modeling_ernie4_5_vl.py (88de2604ae51bccd3f129e8fa4296c537a81dd75)
- Rename processing_ernie_45t_vl.py to processing_ernie4_5_vl.py (202ec6ce58d4d10c88a1de2abb05f28711d2a001)
- Update tokenizer_config.json (28dc5d11cf1e4437d9bb0d1a78ee1ea904dfe2b9)

Files changed (6) hide show

chat_template.json +1 -1
config.json +15 -5
configuration_ernie_45t_vl.py → configuration_ernie4_5_vl.py +1 -1
modeling_ernie_45t_vl.py → modeling_ernie4_5_vl.py +1 -1
processing_ernie_45t_vl.py → processing_ernie4_5_vl.py +306 -11
tokenizer_config.json +2 -2

chat_template.json CHANGED Viewed

@@ -1,3 +1,3 @@
 {
-    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            
{%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }

 {
+    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type in ['image_url', 'image'] -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type in ['video_url', 'video'] -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- 
'' -}}\n            {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }

config.json CHANGED Viewed

@@ -3,11 +3,11 @@
     "Ernie4_5_VLMoeForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_ernie_45t_vl.Ernie4_5_VLMoEConfig",
-    "AutoModel": "modeling_ernie_45t_vl.Ernie4_5_VLMoeForConditionalGeneration",
-    "AutoModelForCausalLM": "modeling_ernie_45t_vl.Ernie4_5_VLMoeForConditionalGeneration",
-    "AutoProcessor": "processing_ernie_45t_vl.Ernie_45T_VLProcessor",
-    "AutoImageProcessor": "processing_ernie_45t_vl.Ernie_45T_VLImageProcessor"
   },
   "pad_token_id": 0,
   "bos_token_id": 1,
@@ -16,6 +16,8 @@
   "hidden_size": 8192,
   "intermediate_size": 28672,
   "im_patch_id": 100295,
   "max_position_embeddings": 131072,
   "num_attention_heads": 64,
   "num_key_value_heads": 8,
@@ -43,6 +45,14 @@
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": false,
   "moe_multimodal_dispatch_use_allgather": "v2-alltoall-unpad-text",
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

     "Ernie4_5_VLMoeForConditionalGeneration"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_ernie4_5_vl.Ernie4_5_VLMoEConfig",
+    "AutoModel": "modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
+    "AutoProcessor": "processing_ernie4_5_vl.Ernie4_5_VLProcessor",
+    "AutoImageProcessor": "processing_ernie4_5_vl.Ernie4_5_VLImageProcessor"
   },
   "pad_token_id": 0,
   "bos_token_id": 1,
   "hidden_size": 8192,
   "intermediate_size": 28672,
   "im_patch_id": 100295,
+  "video_start_token_id": 101306,
+  "video_end_token_id": 101307,
   "max_position_embeddings": 131072,
   "num_attention_heads": 64,
   "num_key_value_heads": 8,
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": false,
   "moe_multimodal_dispatch_use_allgather": "v2-alltoall-unpad-text",
+  "rope_scaling": {
+    "type": "default",
+    "mrope_section": [
+      22,
+      22,
+      20
+    ]
+  },
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

configuration_ernie_45t_vl.py → configuration_ernie4_5_vl.py RENAMED Viewed

@@ -171,7 +171,7 @@ class Ernie4_5_Config(PretrainedConfig):
         use_fast_ln=False,
         weight_share_add_bias=True,
         fuse_linear=False,
-        max_sequence_length=1024,
         ignored_index=-100,
         add_tail_layers=False,
         use_recompute_lm_head=False,

         use_fast_ln=False,
         weight_share_add_bias=True,
         fuse_linear=False,
+        max_sequence_length=None,
         ignored_index=-100,
         add_tail_layers=False,
         use_recompute_lm_head=False,

modeling_ernie_45t_vl.py → modeling_ernie4_5_vl.py RENAMED Viewed

@@ -35,7 +35,7 @@ from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from .configuration_ernie_45t_vl import (
     DFNRopeVisionTransformerConfig,
     Ernie4_5_MoEConfig,
     Ernie4_5_VLMoEConfig,

 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+from .configuration_ernie4_5_vl import (
     DFNRopeVisionTransformerConfig,
     Ernie4_5_MoEConfig,
     Ernie4_5_VLMoEConfig,

processing_ernie_45t_vl.py → processing_ernie4_5_vl.py RENAMED Viewed

@@ -26,6 +26,7 @@ import hashlib
 import threading
 import uuid
 import decord
 from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
@@ -43,8 +44,12 @@ except:
     # moviepy 2.0
     import moviepy as mp
-from .tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
@@ -74,6 +79,293 @@ from transformers.image_utils import (
 logger = logging.get_logger(__name__)
 def round_by_factor(number: int, factor: int) -> int:
     """Snap ``number`` to the nearest multiple of ``factor``.

     Ties are resolved by Python's built-in ``round`` (round-half-to-even).
     """
     multiple_count = round(number / factor)
     return multiple_count * factor
@@ -199,7 +491,7 @@ def make_batched_videos(videos) -> List[VideoInput]:
     raise ValueError(f"Could not make batched video from {videos}")
-class Ernie_45T_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a adaptive image processor that dynamically resizes images based on the original images.
@@ -281,7 +573,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
@@ -290,7 +582,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
@@ -1089,7 +1381,7 @@ def render_frame_timestamp(frame, timestamp, font_rate=0.1):
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
-class Ernie_45T_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
@@ -1236,11 +1528,11 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
     def __call__(
         self,
-        text: List[str],
-        images: List[Image.Image],
-        videos: List[List[Image.Image]],
         **kwargs,
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
@@ -1256,6 +1548,9 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
             "pic_cnt": 0,
             "video_cnt": 0,
         }
         texts = text[0]
         new_video_seg = True
@@ -1520,4 +1815,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
         return list(tokenizer_input_names) + list(image_processor_input_names)
-__all__ = ["Ernie_45T_VLImageProcessor", "Ernie_45T_VLProcessor"]

 import threading
 import uuid
 import decord
+from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
     # moviepy 2.0
     import moviepy as mp
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import (
+    PaddingStrategy,
+    TextInput,
+)
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
 logger = logging.get_logger(__name__)
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """
+    Ernie4_5_VLTokenizer
+    """
+    vocab_files_names = {
+        "vocab_file": "tokenizer.model",
+    }
+    # Model input names expected by the tokenizer
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    # Padding side (where to add padding tokens)
+    padding_side = "right"
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<cls>",
+        eos_token="</s>",
+        mask_token="<mask:0>",
+        pad_token="<pad>",
+        sep_token="<sep>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        """
+        Initialize the Ernie4_5_VLTokenizer
+        Args:
+            vocab_file (str): Path to the tokenizer vocabulary model.
+            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
+            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
+            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
+            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
+            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
+            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
+            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
+            additional_special_tokens (List[str], optional): Additional special tokens to use.
+                Defaults to `["<mask:1>", "<mask:7>"]`.
+            **kwargs (dict): Additional keyword arguments passed along to the superclass.
+        """
+        # Store vocabulary file path
+        self.vocab_file = vocab_file
+        # Initialize SentencePiece processor
+        self.sp_model = spm.SentencePieceProcessor()
+        # Load the vocabulary model
+        self.sp_model.Load(vocab_file)
+        # Set default additional special tokens if none provided
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+    @property
+    def space_token(self):
+        """Return the space token"""
+        return "<mask:1>"
+    @property
+    def space_token_id(self):
+        """Return the ID of the space token"""
+        return self.sp_model.piece_to_id("<mask:1>")
+    @property
+    def gend_token(self):
+        """Return the gender token"""
+        return "<mask:7>"
+    @property
+    def gend_token_id(self):
+        """Return the ID of the gender token"""
+        return self.sp_model.piece_to_id("<mask:7>")
+    @property
+    def im_start_id(self):
+        """Return the ID of the image start token"""
+        return self.sp_model.piece_to_id("<|im_start|>")
+    @property
+    def im_end_id(self):
+        """Return the ID of the image end token"""
+        return self.sp_model.piece_to_id("<|im_end|>")
+    @property
+    def vocab_size(self):
+        """Return the size of the vocabulary"""
+        return self.sp_model.vocab_size()
+    def get_vocab(self):
+        """Return the vocabulary as a dictionary mapping tokens to IDs"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Tokenize the input text into pieces"""
+        return self.sp_model.encode_as_pieces(text)
+    def _convert_token_to_id(self, token):
+        """Convert a token to its corresponding ID"""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, id):
+        """Convert an ID to its corresponding token"""
+        return self.sp_model.id_to_piece(id)
+    def convert_tokens_to_string(self, tokens):
+        """Convert a sequence of tokens back to a string"""
+        current_sub_tokens = []
+        out_string = ""
+        for token in tokens:
+            # Handle special tokens differently
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        # Add any remaining sub-tokens
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def prepare_for_model(self, *args, **kwargs):
+        """Prepare the tokenized inputs for the model"""
+        # Remove add_special_tokens if present (not supported)
+        if "add_special_tokens" in kwargs:
+            kwargs.pop("add_special_tokens")
+        return super().prepare_for_model(*args, **kwargs)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`): The directory to save the vocabulary to
+            filename_prefix (`str`, optional): Prefix to add to the filename
+        Returns:
+            `Tuple(str)`: Paths to the saved files
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        # Construct output vocabulary file path
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
+        # Copy or create vocabulary file
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def _decode(self, *args, **kwargs):
+        """Decode token_id back to text"""
+        # Remove some parameters that aren't used
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+        # Call parent decode method with specific parameters
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+    def _pad(
+        self,
+        encoded_inputs: Dict,
+        max_length: Optional[int] = None,
+        padding_strategy=PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> dict:
+        """Pad the encoded inputs to the specified length"""
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        if return_attention_mask:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            if padding_strategy == PaddingStrategy.LONGEST:
+                max_length = len(required_input)
+            # Adjust max_length if needed for multiple of padding
+            if (
+                max_length is not None
+                and pad_to_multiple_of is not None
+                and (max_length % pad_to_multiple_of != 0)
+            ):
+                max_length = (
+                    (max_length // pad_to_multiple_of) + 1
+                ) * pad_to_multiple_of
+            # Check if padding is needed
+            needs_to_be_padded = (
+                padding_strategy != PaddingStrategy.DO_NOT_PAD
+                and len(required_input) != max_length
+            )
+            # Handle attention mask if present
+            if (
+                "attention_mask" in encoded_inputs
+                and encoded_inputs["attention_mask"] is not None
+            ):
+                attention_mask = encoded_inputs.pop("attention_mask")
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = attention_mask.numpy()
+                elif isinstance(attention_mask, list):
+                    attention_mask = np.array(attention_mask)
+                elif not isinstance(attention_mask, np.ndarray):
+                    raise ValueError(
+                        f"Unexpected type {type(attention_mask)} of attention_mask, "
+                    )
+            else:
+                # Create default attention mask if none provided
+                attention_mask = np.tril(
+                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
+                )
+                attention_mask = np.expand_dims(attention_mask, axis=0)
+            # Perform padding if needed
+            if needs_to_be_padded:
+                difference = max_length - len(required_input)
+                if self.padding_side == "right":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(0, difference)]
+                    else:
+                        pad_width = [(0, 0), (0, difference), (0, difference)]
+                elif self.padding_side == "left":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(difference, 0)]
+                    else:
+                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
+                else:
+                    raise ValueError(
+                        "Invalid padding strategy:" + str(self.padding_side)
+                    )
+                attention_mask = np.pad(
+                    attention_mask,
+                    pad_width=pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+        # Call parent padding method
+        encoded_inputs = super()._pad(
+            encoded_inputs,
+            max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=False,
+        )
+        # Add attention mask back if needed
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = attention_mask.tolist()
+        return encoded_inputs
 def round_by_factor(number: int, factor: int) -> int:
     """Snap ``number`` to the nearest multiple of ``factor``.

     Ties are resolved by Python's built-in ``round`` (round-half-to-even).
     """
     multiple_count = round(number / factor)
     return multiple_count * factor
     raise ValueError(f"Could not make batched video from {videos}")
+class Ernie4_5_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a adaptive image processor that dynamically resizes images based on the original images.
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
+class Ernie4_5_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
     def __call__(
         self,
+        text: Union[str, List[str]],
+        images: List[Image.Image] = [],
+        videos: List[List[Image.Image]] = [],
         **kwargs,
+    ) -> BatchFeature:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
             "pic_cnt": 0,
             "video_cnt": 0,
         }
+        if not isinstance(text, list):
+            text = [text]
         texts = text[0]
         new_video_seg = True
         return list(tokenizer_input_names) + list(image_processor_input_names)
+__all__ = ["Ernie4_5_VLTokenizer", "Ernie4_5_VLImageProcessor", "Ernie4_5_VLProcessor"]

tokenizer_config.json CHANGED Viewed

@@ -14,9 +14,9 @@
     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
-            "tokenization_ernie_45t_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
-    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            
{%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }

     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
+            "processing_ernie4_5_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
+    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type in ['image_url', 'image'] -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type in ['video_url', 'video'] -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- 
'' -}}\n            {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }