Text Generation
MLX
Safetensors
English
Korean
solar_open
upstage
solar
Mixture of Experts
100b
llm
conversational
custom_code
6-bit
Instructions to use mlx-community/Solar-Open-100B-6bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/Solar-Open-100B-6bit with MLX:
# Make sure mlx-lm is installed
# pip install --upgrade mlx-lm

# Generate text with mlx-lm
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/Solar-Open-100B-6bit")

prompt = "Write a story about Einstein"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
)

text = generate(model, tokenizer, prompt=prompt, verbose=True)
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
- Pi new
How to use mlx-community/Solar-Open-100B-6bit with Pi:
Start the MLX server
# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "mlx-community/Solar-Open-100B-6bit"
Configure the model in Pi
# Install Pi:
npm install -g @mariozechner/pi-coding-agent
# Add to ~/.pi/agent/models.json:
{
  "providers": {
    "mlx-lm": {
      "baseUrl": "http://localhost:8080/v1",
      "api": "openai-completions",
      "apiKey": "none",
      "models": [
        { "id": "mlx-community/Solar-Open-100B-6bit" }
      ]
    }
  }
}
Run Pi
# Start Pi in your project directory: pi
- Hermes Agent new
How to use mlx-community/Solar-Open-100B-6bit with Hermes Agent:
Start the MLX server
# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "mlx-community/Solar-Open-100B-6bit"
Configure Hermes
# Install Hermes:
curl -fsSL https://hermes-agent.nousresearch.com/install.sh | bash
hermes setup
# Point Hermes at the local server:
hermes config set model.provider custom
hermes config set model.base_url http://127.0.0.1:8080/v1
hermes config set model.default mlx-community/Solar-Open-100B-6bit
Run Hermes
hermes
- MLX LM
How to use mlx-community/Solar-Open-100B-6bit with MLX LM:
Generate or start a chat session
# Install MLX LM
uv tool install mlx-lm
# Interactive chat REPL
mlx_lm.chat --model "mlx-community/Solar-Open-100B-6bit"
Run an OpenAI-compatible server
# Install MLX LM
uv tool install mlx-lm
# Start the server
mlx_lm.server --model "mlx-community/Solar-Open-100B-6bit"
# Calling the OpenAI-compatible server with curl
curl -X POST "http://localhost:8080/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "mlx-community/Solar-Open-100B-6bit",
    "messages": [
      {"role": "user", "content": "Hello"}
    ]
  }'
| # coding=utf-8 | |
| # Copyright 2025 Upstage AI. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import os | |
| from enum import Enum | |
| from typing import TYPE_CHECKING | |
| import torch | |
| from vllm.sampling_params import SamplingParams | |
| from vllm.v1.sample.logits_processor import ( | |
| AdapterLogitsProcessor, | |
| RequestLogitsProcessor, | |
| ) | |
| if TYPE_CHECKING: | |
| from vllm.config import VllmConfig | |
# Hardcoded token IDs for the Solar tokenizer.

# Special token IDs used by the chat template
BEGIN_TOKEN_ID = 20  # <|begin|>
END_TOKEN_ID = 21  # <|end|>
THINK_TOKEN_ID = 22  # <|think|>
CONTENT_TOKEN_ID = 23  # <|content|>
FLUSH_TOKEN_ID = 24  # <|flush|> (eos token)

# 'assistant' is not exactly a special token, but the logits processing
# treats it as one: it is the role marker that follows <|begin|>.
ASSISTANT_TOKEN_ID = 163444  # assistant

# Tool call related tokens
CALLS_TOKEN_ID = 25  # <|calls|> (eos token for tool calls)
TOOL_CALLS_TOKEN_ID = 30  # <|tool_calls|>
TOOL_CALL_BEGIN_TOKEN_ID = 31  # <|tool_call:begin|>
TOOL_CALL_END_TOKEN_ID = 32  # <|tool_call:end|>
TOOL_CALL_NAME_TOKEN_ID = 33  # <|tool_call:name|>
TOOL_CALL_ARGS_TOKEN_ID = 34  # <|tool_call:args|>
# =============================================================================
# Dynamic Reasoning Budget Configuration
# =============================================================================
# budget = min(max_budget, max(min_budget, max_tokens * ratio / 100))
# When the limits conflict, max_budget wins over min_budget, and min_budget
# wins over the ratio-derived value.
#
# Environment variables that override the defaults below:
#   HIGH effort:
#     SOLAR_REASONING_BUDGET_HIGH_MAX     (default: 32768) - max_budget
#     SOLAR_REASONING_BUDGET_HIGH_MIN     (default: 8192)  - min_budget
#     SOLAR_REASONING_BUDGET_HIGH_RATIO   (default: 60)    - % of max_tokens
#
#   MEDIUM effort:
#     SOLAR_REASONING_BUDGET_MEDIUM_MAX   (default: 16384) - max_budget
#     SOLAR_REASONING_BUDGET_MEDIUM_MIN   (default: 4096)  - min_budget
#     SOLAR_REASONING_BUDGET_MEDIUM_RATIO (default: 30)    - % of max_tokens
#
#   Tool call:
#     SOLAR_TOOL_CALL_ID_BUDGET           (default: 10)    - max tokens for a
#                                                            tool call ID
# =============================================================================

# Effort level assumed when a request does not specify one.
DEFAULT_REASONING_EFFORT = "high"

# HIGH effort defaults (1k = 1024 tokens)
DEFAULT_REASONING_BUDGET_HIGH_MAX = 32 * 1024
DEFAULT_REASONING_BUDGET_HIGH_MIN = 8 * 1024
DEFAULT_REASONING_BUDGET_HIGH_RATIO = 60

# MEDIUM effort defaults
DEFAULT_REASONING_BUDGET_MEDIUM_MAX = 16 * 1024
DEFAULT_REASONING_BUDGET_MEDIUM_MIN = 4 * 1024
DEFAULT_REASONING_BUDGET_MEDIUM_RATIO = 30

# Tool call defaults
DEFAULT_TOOL_CALL_ID_BUDGET = 10

# Parsed once at import time so "-inf" is not re-parsed on every use.
NEG_INF = float("-inf")
def is_reasoning_request(params: SamplingParams) -> bool:
    """Tell whether the request should go through a reasoning phase.

    A request with no ``reasoning_effort`` set counts as a reasoning
    request; otherwise only the "medium" and "high" efforts do.
    """
    effort = params.reasoning_effort
    if effort is None:
        return True
    return effort in ("medium", "high")
def is_structured_outputs(params: SamplingParams) -> bool:
    """Tell whether the request carries an active structured-outputs constraint.

    Returns False when ``structured_outputs`` is unset or when it reports
    that all of its constraints are none.
    """
    constraints = params.structured_outputs
    if constraints is None:
        return False
    return not constraints.all_constraints_none()
class GenerationState(Enum):
    """Phases of the Solar Open chat template a response can be in."""

    # Nothing has been generated yet
    INITIAL = "initial"

    # Start of a follow-up message (after the think message ended)
    NEW_MESSAGE_BEGIN = "new_message_begin"  # last marker was <|begin|>
    NEW_MESSAGE_ASSISTANT = "new_message_assistant"  # assistant right after <|begin|>

    # Think message
    THINK_BEGIN = "think_begin"  # last marker was <|think|>
    THINK_IN_PROGRESS = "think_in_progress"  # think tokens are being generated
    THINK_END = "think_end"  # <|end|> closed the think message
    THINK_FLUSH = "think_flush"  # <|flush|> closed the think message

    # Content message
    CONTENT_BEGIN = "content_begin"  # last marker was <|content|>
    CONTENT_IN_PROGRESS = "content_in_progress"  # content tokens are being generated
    CONTENT_END = "content_end"  # <|end|> closed the content message
    CONTENT_FLUSH = "content_flush"  # <|flush|> closed the content message

    # Tool call message
    # Flow: <|tool_calls|> -> (<|tool_call:begin|> -> id -> <|tool_call:name|>
    #       -> name -> <|tool_call:args|> -> args -> <|tool_call:end|>)+ -> <|calls|>
    # Note: a think message may precede <|tool_calls|>
    TOOL_CALLS_BEGIN = "tool_calls_begin"  # last marker was <|tool_calls|>
    TOOL_CALL_BEGIN = "tool_call_begin"  # last marker was <|tool_call:begin|>
    TOOL_CALL_ID_IN_PROGRESS = "tool_call_id_in_progress"  # tool call ID tokens
    TOOL_CALL_NAME_BEGIN = "tool_call_name_begin"  # last marker was <|tool_call:name|>
    TOOL_CALL_NAME_IN_PROGRESS = "tool_call_name_in_progress"  # tool name tokens
    TOOL_CALL_ARGS_BEGIN = "tool_call_args_begin"  # last marker was <|tool_call:args|>
    TOOL_CALL_ARGS_IN_PROGRESS = "tool_call_args_in_progress"  # tool argument (JSON) tokens
    TOOL_CALL_END = "tool_call_end"  # <|tool_call:end|>; another call or <|calls|> may follow
    CALLS = "calls"  # <|calls|> (eos token for tool calls)
def get_generation_state(
    output_token_ids: list[int],
    begin_token_id: int = BEGIN_TOKEN_ID,
    end_token_id: int = END_TOKEN_ID,
    flush_token_id: int = FLUSH_TOKEN_ID,
    think_token_id: int = THINK_TOKEN_ID,
    content_token_id: int = CONTENT_TOKEN_ID,
    tool_calls_token_id: int = TOOL_CALLS_TOKEN_ID,
    tool_call_begin_token_id: int = TOOL_CALL_BEGIN_TOKEN_ID,
    tool_call_name_token_id: int = TOOL_CALL_NAME_TOKEN_ID,
    tool_call_args_token_id: int = TOOL_CALL_ARGS_TOKEN_ID,
    tool_call_end_token_id: int = TOOL_CALL_END_TOKEN_ID,
    calls_token_id: int = CALLS_TOKEN_ID,
    assistant_token_id: int = ASSISTANT_TOKEN_ID,
) -> GenerationState:
    """Determine the current generation state based on output token IDs.

    Scans the generated tokens from the start and tracks which phase of the
    chat template the generation is currently in.

    Response format specs:
    - think mode: <|think|>{{think-tokens}}<|end|><|begin|>assistant<|content|>{{content-tokens}}<|flush|>
    - tool mode: <|begin|>assistant<|tool_calls|><|tool_call:begin|>{{id}}<|tool_call:name|>{{name}}<|tool_call:args|>{{args}}<|tool_call:end|><|calls|>
    - tool mode (with think): <|think|>{{think-tokens}}<|end|><|begin|>assistant<|tool_calls|>...<|calls|>
    - no-think mode: <|content|>{{content-tokens}}<|flush|>

    Args:
        output_token_ids: List of token IDs generated so far.
        begin_token_id: Token ID for <|begin|>.
        end_token_id: Token ID for <|end|>.
        flush_token_id: Token ID for <|flush|> (eos).
        think_token_id: Token ID for <|think|>.
        content_token_id: Token ID for <|content|>.
        tool_calls_token_id: Token ID for <|tool_calls|>.
        tool_call_begin_token_id: Token ID for <|tool_call:begin|>.
        tool_call_name_token_id: Token ID for <|tool_call:name|>.
        tool_call_args_token_id: Token ID for <|tool_call:args|>.
        tool_call_end_token_id: Token ID for <|tool_call:end|>.
        calls_token_id: Token ID for <|calls|> (eos).
        assistant_token_id: Token ID for assistant.

    Returns:
        GenerationState indicating the current phase of generation
        (``INITIAL`` for an empty token list).
    """
    # A regular token moves a "*_BEGIN" state to its "*_IN_PROGRESS"
    # counterpart; every other state is left unchanged by regular tokens.
    advance_on_regular = {
        GenerationState.THINK_BEGIN: GenerationState.THINK_IN_PROGRESS,
        GenerationState.CONTENT_BEGIN: GenerationState.CONTENT_IN_PROGRESS,
        GenerationState.TOOL_CALL_BEGIN: GenerationState.TOOL_CALL_ID_IN_PROGRESS,
        GenerationState.TOOL_CALL_NAME_BEGIN: GenerationState.TOOL_CALL_NAME_IN_PROGRESS,
        GenerationState.TOOL_CALL_ARGS_BEGIN: GenerationState.TOOL_CALL_ARGS_IN_PROGRESS,
    }

    state = GenerationState.INITIAL
    in_think = False
    in_content = False

    for token_id in output_token_ids:
        if token_id == think_token_id:
            state = GenerationState.THINK_BEGIN
            in_think = True
            in_content = False
        elif token_id == content_token_id:
            state = GenerationState.CONTENT_BEGIN
            in_content = True
            in_think = False
        elif token_id == tool_calls_token_id:
            state = GenerationState.TOOL_CALLS_BEGIN
            in_think = False
            in_content = False
        elif token_id == tool_call_begin_token_id:
            state = GenerationState.TOOL_CALL_BEGIN
        elif token_id == tool_call_name_token_id:
            state = GenerationState.TOOL_CALL_NAME_BEGIN
        elif token_id == tool_call_args_token_id:
            state = GenerationState.TOOL_CALL_ARGS_BEGIN
        elif token_id == tool_call_end_token_id:
            state = GenerationState.TOOL_CALL_END
        elif token_id == calls_token_id:
            state = GenerationState.CALLS
        elif token_id == begin_token_id:
            state = GenerationState.NEW_MESSAGE_BEGIN
        elif (
            token_id == assistant_token_id
            and state == GenerationState.NEW_MESSAGE_BEGIN
        ):
            # 'assistant' is a role marker only directly after <|begin|>.
            # Anywhere else it is ordinary vocabulary and must fall through
            # to the regular-token branch so "*_BEGIN" states still advance.
            state = GenerationState.NEW_MESSAGE_ASSISTANT
        elif token_id == end_token_id:
            if in_think:
                state = GenerationState.THINK_END
                in_think = False
            elif in_content:
                state = GenerationState.CONTENT_END
                in_content = False
        elif token_id == flush_token_id:
            if in_think:
                state = GenerationState.THINK_FLUSH
                in_think = False
            elif in_content:
                state = GenerationState.CONTENT_FLUSH
                in_content = False
        else:
            # Regular token - update state based on current context
            state = advance_on_regular.get(state, state)

    return state
# Pre-computed list of all special token IDs for batch indexing
_ALL_SPECIAL_TOKEN_IDS = [
    BEGIN_TOKEN_ID,
    END_TOKEN_ID,
    THINK_TOKEN_ID,
    CONTENT_TOKEN_ID,
    FLUSH_TOKEN_ID,
    CALLS_TOKEN_ID,
    TOOL_CALLS_TOKEN_ID,
    TOOL_CALL_BEGIN_TOKEN_ID,
    TOOL_CALL_END_TOKEN_ID,
    TOOL_CALL_NAME_TOKEN_ID,
    TOOL_CALL_ARGS_TOKEN_ID,
]


def _special_token_ids_except(*allowed_ids: int) -> list[int]:
    """Return every ID in ``_ALL_SPECIAL_TOKEN_IDS`` that is not in ``allowed_ids``."""
    return [tid for tid in _ALL_SPECIAL_TOKEN_IDS if tid not in allowed_ids]


# Pre-computed lists for state-specific batch indexing. Each one is the full
# special-token list minus the tokens that stay allowed in that state, derived
# from the single list above so the masks cannot drift apart.

# For THINK states (allow END)
_SPECIAL_EXCEPT_END = _special_token_ids_except(END_TOKEN_ID)

# For NEW_MESSAGE_ASSISTANT (allow CONTENT, TOOL_CALLS)
_SPECIAL_EXCEPT_CONTENT_TOOLCALLS = _special_token_ids_except(
    CONTENT_TOKEN_ID, TOOL_CALLS_TOKEN_ID
)

# For CONTENT states (allow FLUSH)
_SPECIAL_EXCEPT_FLUSH = _special_token_ids_except(FLUSH_TOKEN_ID)

# For TOOL_CALL_ID_IN_PROGRESS (allow TOOL_CALL_NAME)
_SPECIAL_EXCEPT_TOOLCALL_NAME = _special_token_ids_except(TOOL_CALL_NAME_TOKEN_ID)

# For TOOL_CALL_NAME_IN_PROGRESS (allow TOOL_CALL_ARGS)
_SPECIAL_EXCEPT_TOOLCALL_ARGS = _special_token_ids_except(TOOL_CALL_ARGS_TOKEN_ID)

# For TOOL_CALL_ARGS_IN_PROGRESS (allow TOOL_CALL_END)
_SPECIAL_EXCEPT_TOOLCALL_END = _special_token_ids_except(TOOL_CALL_END_TOKEN_ID)
def _forbid_all_special_tokens(logits: torch.Tensor) -> None:
    """Mask every special token by writing -inf into ``logits`` in place."""
    masked_ids = _ALL_SPECIAL_TOKEN_IDS
    logits[masked_ids] = NEG_INF
class SolarOpenTemplateEnforcer:
    """Request-level logits processor that enforces Solar Open chat template.

    Enforces the following generation rules:
    - think mode: <|think|>{{tokens}}<|end|><|begin|>assistant<|content|>{{tokens}}<|flush|>
    - tool mode: <|tool_calls|><|tool_call:begin|>{{id}}<|tool_call:name|>{{name}}<|tool_call:args|>{{args}}<|tool_call:end|><|calls|>
    - tool+think mode: <|think|>{{tokens}}<|end|><|begin|>assistant<|tool_calls|>...<|calls|>
    - no-think mode: <|content|>{{tokens}}<|flush|>

    Key constraints:
    - Think message can only appear first
    - Think message must be followed by another message
    - Content and tool messages cannot coexist
    - Maximum 2 messages (think + content/tool, or just content/tool)

    Performance optimization:
    - Uses incremental state tracking to avoid full token sequence scan on each call
    - Maintains local counters for budget tracking
    - Uses pre-computed constants to avoid repeated object creation
    """

    # Pre-computed frozenset for reasoning state check (avoids set creation per call)
    _REASONING_STATES = frozenset({
        GenerationState.INITIAL,
        GenerationState.THINK_BEGIN,
        GenerationState.THINK_IN_PROGRESS,
    })

    # A regular token moves a "*_BEGIN" state to its "*_IN_PROGRESS"
    # counterpart; every other state is left unchanged by regular tokens.
    _ADVANCE_ON_REGULAR_TOKEN = {
        GenerationState.THINK_BEGIN: GenerationState.THINK_IN_PROGRESS,
        GenerationState.CONTENT_BEGIN: GenerationState.CONTENT_IN_PROGRESS,
        GenerationState.TOOL_CALL_BEGIN: GenerationState.TOOL_CALL_ID_IN_PROGRESS,
        GenerationState.TOOL_CALL_NAME_BEGIN: GenerationState.TOOL_CALL_NAME_IN_PROGRESS,
        GenerationState.TOOL_CALL_ARGS_BEGIN: GenerationState.TOOL_CALL_ARGS_IN_PROGRESS,
    }

    def __init__(
        self,
        is_reasoning_request: bool,
        is_structured_outputs: bool,
        reasoning_budget: int | None = None,
        tool_call_id_budget: int = DEFAULT_TOOL_CALL_ID_BUDGET,
    ):
        """Create an enforcer for a single request.

        Args:
            is_reasoning_request: Whether generation must start with <|think|>.
            is_structured_outputs: Whether the request has structured-output
                constraints; if so, logits are only controlled during the
                reasoning phase (and not at all for non-reasoning requests).
            reasoning_budget: Number of think tokens after which <|end|> is
                forced. ``None`` disables the limit.
            tool_call_id_budget: Number of tool call ID tokens after which
                <|tool_call:name|> is forced.
        """
        self._is_reasoning_request = is_reasoning_request
        self._is_structured_outputs = is_structured_outputs
        self._reasoning_budget = reasoning_budget
        self._tool_call_id_budget = tool_call_id_budget

        # Incremental state tracking
        self._state = GenerationState.INITIAL
        self._last_processed_len = 0
        self._in_think = False
        self._in_content = False

        # Budget counters
        self._think_token_count = 0
        self._tool_call_id_token_count = 0

    def _reset_state(self) -> None:
        """Reset all incremental state to initial values.

        Called when defensive reprocessing is needed (e.g., token sequence
        inconsistency).
        """
        self._state = GenerationState.INITIAL
        self._last_processed_len = 0
        self._in_think = False
        self._in_content = False
        self._think_token_count = 0
        self._tool_call_id_token_count = 0

    def _process_token(self, token_id: int) -> None:
        """Process a single token and update internal state incrementally.

        Args:
            token_id: The token ID to process.
        """
        if token_id == THINK_TOKEN_ID:
            self._state = GenerationState.THINK_BEGIN
            self._in_think = True
            self._in_content = False
            self._think_token_count = 0  # Reset counter for new think block
        elif token_id == CONTENT_TOKEN_ID:
            self._state = GenerationState.CONTENT_BEGIN
            self._in_content = True
            self._in_think = False
        elif token_id == TOOL_CALLS_TOKEN_ID:
            self._state = GenerationState.TOOL_CALLS_BEGIN
            self._in_think = False
            self._in_content = False
        elif token_id == TOOL_CALL_BEGIN_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_BEGIN
            self._tool_call_id_token_count = 0  # Reset counter for new tool call
        elif token_id == TOOL_CALL_NAME_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_NAME_BEGIN
        elif token_id == TOOL_CALL_ARGS_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_ARGS_BEGIN
        elif token_id == TOOL_CALL_END_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_END
        elif token_id == CALLS_TOKEN_ID:
            self._state = GenerationState.CALLS
        elif token_id == BEGIN_TOKEN_ID:
            self._state = GenerationState.NEW_MESSAGE_BEGIN
        elif (
            token_id == ASSISTANT_TOKEN_ID
            and self._state == GenerationState.NEW_MESSAGE_BEGIN
        ):
            # 'assistant' is a role marker only directly after <|begin|>.
            # Anywhere else it is ordinary vocabulary and must fall through
            # to the regular-token branch so it advances the state and is
            # counted against the think / tool-call-ID budgets.
            self._state = GenerationState.NEW_MESSAGE_ASSISTANT
        elif token_id == END_TOKEN_ID:
            if self._in_think:
                self._state = GenerationState.THINK_END
                self._in_think = False
            elif self._in_content:
                self._state = GenerationState.CONTENT_END
                self._in_content = False
        elif token_id == FLUSH_TOKEN_ID:
            if self._in_think:
                self._state = GenerationState.THINK_FLUSH
                self._in_think = False
            elif self._in_content:
                self._state = GenerationState.CONTENT_FLUSH
                self._in_content = False
        else:
            # Regular token - update state and counters based on current context
            self._state = self._ADVANCE_ON_REGULAR_TOKEN.get(
                self._state, self._state
            )
            if self._state == GenerationState.THINK_IN_PROGRESS:
                self._think_token_count += 1
            elif self._state == GenerationState.TOOL_CALL_ID_IN_PROGRESS:
                self._tool_call_id_token_count += 1

    def _update_state_incremental(self, output_token_ids: list[int]) -> None:
        """Update internal state by processing only new tokens.

        Args:
            output_token_ids: Full list of output token IDs.
        """
        current_len = len(output_token_ids)

        # Defensive check: if token sequence is shorter than expected,
        # reset and reprocess from the beginning
        if current_len < self._last_processed_len:
            self._reset_state()

        # Process only new tokens
        for token_id in output_token_ids[self._last_processed_len:]:
            self._process_token(token_id)
        self._last_processed_len = current_len

    @staticmethod
    def _count_think_tokens(output_token_ids: list[int]) -> int:
        """Count the number of tokens generated after the first <|think|> token.

        Returns 0 if <|think|> token is not found (defensive).

        Note: This static method is kept for backward compatibility and testing.
        The incremental version uses _think_token_count instead.
        """
        try:
            think_index = output_token_ids.index(THINK_TOKEN_ID)
        except ValueError:
            return 0
        return len(output_token_ids) - think_index - 1

    @staticmethod
    def _count_tool_call_id_tokens(output_token_ids: list[int]) -> int:
        """Count the number of tokens generated after the last <|tool_call:begin|> token.

        Returns 0 if <|tool_call:begin|> token is not found (defensive).

        Note: This static method is kept for backward compatibility and testing.
        The incremental version uses _tool_call_id_token_count instead.
        """
        # Walk backwards so the last <|tool_call:begin|> is found first
        # (multi-tool-call support); the number of tokens skipped before
        # reaching it is exactly the number of tokens that follow it.
        for tokens_after, token_id in enumerate(reversed(output_token_ids)):
            if token_id == TOOL_CALL_BEGIN_TOKEN_ID:
                return tokens_after
        return 0

    @staticmethod
    def _keep_only(logits: torch.Tensor, *token_ids: int) -> None:
        """Mask everything except ``token_ids``, preserving their original logits."""
        allowed = list(token_ids)
        kept = logits[allowed]  # list indexing returns a copy, safe to fill below
        logits.fill_(NEG_INF)
        logits[allowed] = kept

    @staticmethod
    def _force_token(logits: torch.Tensor, token_id: int) -> None:
        """Mask everything except ``token_id`` and give it a logit of 0.0."""
        logits.fill_(NEG_INF)
        logits[token_id] = 0.0

    def __call__(
        self,
        output_token_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Constrain ``logits`` for the next token according to the template.

        Args:
            output_token_ids: All token IDs generated so far for this request.
            logits: Next-token logits; modified in place.

        Returns:
            The same ``logits`` tensor after masking.
        """
        # Update state incrementally (only process new tokens)
        self._update_state_incremental(output_token_ids)
        state = self._state

        # Structured outputs mode: a non-reasoning request gets no logit
        # control at all; a reasoning request is controlled only while it is
        # still in the reasoning phase, after which structured outputs take over.
        if self._is_structured_outputs and (
            not self._is_reasoning_request
            or state not in self._REASONING_STATES
        ):
            return logits

        if state == GenerationState.INITIAL:
            if self._is_reasoning_request:
                # Force: <|think|> only (reasoning request must start with think)
                self._keep_only(logits, THINK_TOKEN_ID)
            else:
                # Allow: <|content|>, <|tool_calls|> only
                self._keep_only(logits, CONTENT_TOKEN_ID, TOOL_CALLS_TOKEN_ID)

        elif state in (GenerationState.THINK_BEGIN, GenerationState.THINK_IN_PROGRESS):
            # Check if reasoning budget is exceeded (using incremental counter)
            if (
                self._reasoning_budget is not None
                and state == GenerationState.THINK_IN_PROGRESS
                and self._think_token_count >= self._reasoning_budget
            ):
                # Force <|end|> token to terminate reasoning
                self._force_token(logits, END_TOKEN_ID)
                return logits
            # Transform: <|flush|> -> <|end|>
            # Think must be followed by another message, so prevent early termination
            logits[END_TOKEN_ID] = torch.maximum(
                logits[END_TOKEN_ID], logits[FLUSH_TOKEN_ID]
            )
            # Forbid all special tokens except <|end|>
            logits[_SPECIAL_EXCEPT_END] = NEG_INF

        elif state == GenerationState.THINK_END:
            # Force: <|begin|> only
            # Think must be followed by another message
            self._force_token(logits, BEGIN_TOKEN_ID)

        elif state == GenerationState.NEW_MESSAGE_BEGIN:
            # Force: assistant token only
            self._force_token(logits, ASSISTANT_TOKEN_ID)

        elif state == GenerationState.NEW_MESSAGE_ASSISTANT:
            # Allow: <|content|>, <|tool_calls|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_CONTENT_TOOLCALLS] = NEG_INF

        elif state in (GenerationState.CONTENT_BEGIN, GenerationState.CONTENT_IN_PROGRESS):
            # Transform: <|end|> -> <|flush|>
            # Content cannot be followed by another message
            logits[FLUSH_TOKEN_ID] = torch.maximum(
                logits[FLUSH_TOKEN_ID], logits[END_TOKEN_ID]
            )
            # Forbid all special tokens except <|flush|>
            logits[_SPECIAL_EXCEPT_FLUSH] = NEG_INF

        elif state == GenerationState.TOOL_CALLS_BEGIN:
            # Force: <|tool_call:begin|> only
            self._keep_only(logits, TOOL_CALL_BEGIN_TOKEN_ID)

        elif state == GenerationState.TOOL_CALL_BEGIN:
            # Allow: regular tokens only (ID generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_ID_IN_PROGRESS:
            # Check if tool call ID budget is exceeded (using incremental counter)
            if self._tool_call_id_token_count >= self._tool_call_id_budget:
                # Force <|tool_call:name|> token to terminate ID generation
                self._force_token(logits, TOOL_CALL_NAME_TOKEN_ID)
                return logits
            # Allow: <|tool_call:name|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_NAME] = NEG_INF

        elif state == GenerationState.TOOL_CALL_NAME_BEGIN:
            # Allow: regular tokens only (function name generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_NAME_IN_PROGRESS:
            # Allow: <|tool_call:args|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_ARGS] = NEG_INF

        elif state == GenerationState.TOOL_CALL_ARGS_BEGIN:
            # Allow: regular tokens only (JSON args generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_ARGS_IN_PROGRESS:
            # Allow: <|tool_call:end|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_END] = NEG_INF

        elif state == GenerationState.TOOL_CALL_END:
            # Allow: <|tool_call:begin|> (next tool call), <|calls|> (end)
            # Forbid: everything else
            self._keep_only(logits, TOOL_CALL_BEGIN_TOKEN_ID, CALLS_TOKEN_ID)

        # Remaining states (e.g. CALLS, flush/end after content): no
        # processing needed, the logits pass through untouched.
        return logits
class SolarOpenTemplateLogitsProcessor(AdapterLogitsProcessor):
    """
    Logits processor that enforces Solar Open chat template.

    This processor manages the generation flow according to the
    Solar Open chat template by tracking generation states. It reads the
    reasoning-budget settings from the environment once at construction and
    creates one ``SolarOpenTemplateEnforcer`` per request.
    """

    def __init__(
        self,
        vllm_config: "VllmConfig",
        device: torch.device,
        is_pin_memory: bool,
    ):
        super().__init__(vllm_config, device, is_pin_memory)

        # Dynamic reasoning budget settings for HIGH effort
        self._high_max = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_MAX", DEFAULT_REASONING_BUDGET_HIGH_MAX
        )
        self._high_min = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_MIN", DEFAULT_REASONING_BUDGET_HIGH_MIN
        )
        self._high_ratio = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_RATIO", DEFAULT_REASONING_BUDGET_HIGH_RATIO
        )

        # Dynamic reasoning budget settings for MEDIUM effort
        self._medium_max = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_MAX", DEFAULT_REASONING_BUDGET_MEDIUM_MAX
        )
        self._medium_min = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_MIN", DEFAULT_REASONING_BUDGET_MEDIUM_MIN
        )
        self._medium_ratio = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_RATIO", DEFAULT_REASONING_BUDGET_MEDIUM_RATIO
        )

        self._tool_call_id_budget: int = self._parse_env_int(
            "SOLAR_TOOL_CALL_ID_BUDGET", DEFAULT_TOOL_CALL_ID_BUDGET
        )

    @staticmethod
    def _parse_env_int(env_var: str, default: int) -> int:
        """Parse environment variable as integer, return default if not set or invalid."""
        value = os.environ.get(env_var)
        if value is None:
            return default
        try:
            return int(value)
        except ValueError:
            return default

    def _calculate_reasoning_budget(
        self, effort: str, max_tokens: int | None
    ) -> int:
        """Calculate dynamic reasoning budget based on effort level and max_tokens.

        Priority (higher priority conditions are applied first):
        1. max_budget: Upper limit for reasoning tokens
        2. min_budget: Lower limit for reasoning tokens
        3. ratio: Percentage of max_tokens allocated for reasoning
           (e.g., 60 means 60%)

        budget = min(max_budget, max(min_budget, max_tokens * ratio // 100))

        Args:
            effort: "medium" selects the MEDIUM settings; any other value
                (including "high" and unknown levels) selects HIGH.
            max_tokens: Output token limit of the request. ``None`` (no
                limit) yields ``max_budget``, since there is nothing to take
                a percentage of.

        Returns:
            The number of think tokens allowed before reasoning is cut off.
        """
        if effort == "medium":
            max_budget = self._medium_max
            min_budget = self._medium_min
            ratio = self._medium_ratio
        else:
            # "high", plus fallback to high for unknown effort levels
            max_budget = self._high_max
            min_budget = self._high_min
            ratio = self._high_ratio

        if max_tokens is None:
            # Unbounded output: only the upper limit applies.
            return max_budget

        # Calculate ratio-based budget (ratio is percentage, e.g., 60 means 60%)
        ratio_budget = max_tokens * ratio // 100

        # Apply priority: max > min > ratio
        return min(max_budget, max(min_budget, ratio_budget))

    def is_argmax_invariant(self) -> bool:
        """This processor can change argmax result by forcing specific tokens."""
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> RequestLogitsProcessor | None:
        """Build the per-request template enforcer from the sampling params.

        The reasoning budget is derived from the request's reasoning effort
        (defaulting to ``DEFAULT_REASONING_EFFORT``) and its ``max_tokens``.
        """
        reasoning_effort = params.reasoning_effort or DEFAULT_REASONING_EFFORT
        reasoning_budget = self._calculate_reasoning_budget(
            reasoning_effort, params.max_tokens
        )

        return SolarOpenTemplateEnforcer(
            is_reasoning_request=is_reasoning_request(params),
            is_structured_outputs=is_structured_outputs(params),
            reasoning_budget=reasoning_budget,
            tool_call_id_budget=self._tool_call_id_budget,
        )