Trouter-Library committed on
Commit 1966e56 · verified · 1 Parent(s): 720e2e5

Create inference.py

Files changed (1)
  1. inference.py +553 -0
inference.py ADDED
@@ -0,0 +1,553 @@
+ """
+ Helion-V2.0-Thinking Inference Script
+ A comprehensive example showing different ways to use the multimodal model
+ with vision, tool use, and structured output capabilities
+ """
+
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     AutoProcessor,
+     BitsAndBytesConfig
+ )
+ from PIL import Image
+ import requests
+ from typing import Optional, List, Dict, Any
+ import argparse
+ import json
+ import re
+
+
+ class HelionInference:
+     """Wrapper class for Helion-V2.0-Thinking multimodal model inference"""
+
+     def __init__(
+         self,
+         model_name: str = "DeepXR/Helion-V2.0-Thinking",
+         device: str = "auto",
+         load_in_8bit: bool = False,
+         load_in_4bit: bool = False,
+         use_flash_attention: bool = True
+     ):
+         """
+         Initialize the model, tokenizer, and processor
+
+         Args:
+             model_name: HuggingFace model identifier
+             device: Device to load model on ('auto', 'cuda', 'cpu')
+             load_in_8bit: Enable 8-bit quantization
+             load_in_4bit: Enable 4-bit quantization
+             use_flash_attention: Use Flash Attention 2 for efficiency
+         """
+         print(f"Loading {model_name}...")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.processor = AutoProcessor.from_pretrained(model_name)
+
+         # Configure quantization if requested
+         quantization_config = None
+         if load_in_4bit:
+             quantization_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.bfloat16,
+                 bnb_4bit_use_double_quant=True,
+                 bnb_4bit_quant_type="nf4"
+             )
+         elif load_in_8bit:
+             quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+         # Load model (attn_implementation is the current transformers kwarg;
+         # use_flash_attention_2 is deprecated)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.bfloat16,
+             device_map=device,
+             quantization_config=quantization_config,
+             attn_implementation="flash_attention_2" if use_flash_attention else None,
+             trust_remote_code=True
+         )
+
+         self.model.eval()
+         print("Model loaded successfully!")
+
+         # Tool definitions
+         self.tools = self._initialize_tools()
+
+     def _initialize_tools(self) -> List[Dict[str, Any]]:
+         """Initialize available tools for function calling"""
+         return [
+             {
+                 "name": "calculator",
+                 "description": "Perform mathematical calculations",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "expression": {
+                             "type": "string",
+                             "description": "Mathematical expression to evaluate"
+                         }
+                     },
+                     "required": ["expression"]
+                 }
+             },
+             {
+                 "name": "web_search",
+                 "description": "Search the web for current information",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "query": {
+                             "type": "string",
+                             "description": "The search query"
+                         }
+                     },
+                     "required": ["query"]
+                 }
+             },
+             {
+                 "name": "code_executor",
+                 "description": "Execute Python code safely",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "code": {
+                             "type": "string",
+                             "description": "Python code to execute"
+                         }
+                     },
+                     "required": ["code"]
+                 }
+             }
+         ]
+
+     def generate(
+         self,
+         prompt: str,
+         max_new_tokens: int = 512,
+         temperature: float = 0.7,
+         top_p: float = 0.9,
+         top_k: int = 50,
+         repetition_penalty: float = 1.1,
+         do_sample: bool = True,
+         images: Optional[List[Image.Image]] = None
+     ) -> str:
+         """
+         Generate text from a prompt with optional images
+
+         Args:
+             prompt: Input text
+             max_new_tokens: Maximum tokens to generate
+             temperature: Sampling temperature
+             top_p: Nucleus sampling threshold
+             top_k: Top-k sampling parameter
+             repetition_penalty: Penalty for repeating tokens
+             do_sample: Use sampling vs greedy decoding
+             images: Optional list of PIL images
+
+         Returns:
+             Generated text
+         """
+         if images:
+             inputs = self.processor(
+                 text=prompt,
+                 images=images,
+                 return_tensors="pt"
+             ).to(self.model.device)
+         else:
+             inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 **inputs,
+                 max_new_tokens=max_new_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 top_k=top_k,
+                 repetition_penalty=repetition_penalty,
+                 do_sample=do_sample,
+                 pad_token_id=self.tokenizer.eos_token_id
+             )
+
+         # Decode only the newly generated tokens so the prompt is not echoed back
+         input_length = inputs["input_ids"].shape[-1]
+         generated_tokens = outputs[0][input_length:]
+         if images:
+             generated_text = self.processor.decode(generated_tokens, skip_special_tokens=True)
+         else:
+             generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         return generated_text.strip()
+
+     def analyze_image(
+         self,
+         image: Image.Image,
+         query: str = "Describe this image in detail.",
+         max_new_tokens: int = 512
+     ) -> str:
+         """
+         Analyze an image with a specific query
+
+         Args:
+             image: PIL Image object
+             query: Question or instruction about the image
+             max_new_tokens: Maximum tokens to generate
+
+         Returns:
+             Image analysis response
+         """
+         return self.generate(
+             prompt=query,
+             images=[image],
+             max_new_tokens=max_new_tokens,
+             temperature=0.7
+         )
+
+     def extract_text_from_image(
+         self,
+         image: Image.Image
+     ) -> str:
+         """
+         Perform OCR on an image
+
+         Args:
+             image: PIL Image object
+
+         Returns:
+             Extracted text
+         """
+         prompt = "Extract all text from this image. Return only the text content without any additional commentary."
+         return self.generate(
+             prompt=prompt,
+             images=[image],
+             max_new_tokens=1024,
+             temperature=0.3
+         )
+
+     def call_function(
+         self,
+         prompt: str,
+         tools: Optional[List[Dict[str, Any]]] = None
+     ) -> Dict[str, Any]:
+         """
+         Use function calling to determine which tool to use
+
+         Args:
+             prompt: User query
+             tools: List of available tools (uses default if None)
+
+         Returns:
+             Dict with tool name and parameters
+         """
+         if tools is None:
+             tools = self.tools
+
+         system_prompt = f"""You are a helpful assistant with access to the following tools:
+ {json.dumps(tools, indent=2)}
+
+ To use a tool, respond with ONLY a JSON object in this exact format:
+ {{"tool": "tool_name", "parameters": {{"param": "value"}}}}
+
+ Do not include any other text or explanation."""
+
+         full_prompt = f"{system_prompt}\n\nUser query: {prompt}\n\nTool call:"
+
+         response = self.generate(
+             prompt=full_prompt,
+             max_new_tokens=256,
+             temperature=0.2,
+             do_sample=False
+         )
+
+         # Parse JSON response
+         try:
+             # Extract JSON from response
+             json_match = re.search(r'\{.*\}', response, re.DOTALL)
+             if json_match:
+                 tool_call = json.loads(json_match.group())
+                 return tool_call
+             else:
+                 return {"error": "No valid JSON found in response", "raw": response}
+         except json.JSONDecodeError as e:
+             return {"error": f"JSON decode error: {str(e)}", "raw": response}
+
+     def structured_output(
+         self,
+         prompt: str,
+         schema: Dict[str, Any]
+     ) -> Dict[str, Any]:
+         """
+         Generate structured JSON output matching a schema
+
+         Args:
+             prompt: Input prompt
+             schema: JSON schema for the output
+
+         Returns:
+             Parsed JSON response
+         """
+         full_prompt = f"""Generate a JSON response matching this schema:
+ {json.dumps(schema, indent=2)}
+
+ User request: {prompt}
+
+ Return ONLY valid JSON, no other text:"""
+
+         response = self.generate(
+             prompt=full_prompt,
+             max_new_tokens=1024,
+             temperature=0.2,
+             do_sample=False
+         )
+
+         # Parse JSON response
+         try:
+             # Try to extract JSON from markdown code blocks
+             if "```json" in response:
+                 json_str = response.split("```json")[-1].split("```")[0].strip()
+             elif "```" in response:
+                 json_str = response.split("```")[1].strip()
+             else:
+                 json_str = response.strip()
+
+             return json.loads(json_str)
+         except json.JSONDecodeError as e:
+             return {"error": f"JSON decode error: {str(e)}", "raw": response}
+
+     def chat(
+         self,
+         messages: List[Dict[str, Any]],
+         max_new_tokens: int = 512,
+         temperature: float = 0.7,
+         top_p: float = 0.9
+     ) -> str:
+         """
+         Chat interface using conversation format with support for images
+
+         Args:
+             messages: List of message dicts with 'role', 'content', and optional 'images' keys
+             max_new_tokens: Maximum tokens to generate
+             temperature: Sampling temperature
+             top_p: Nucleus sampling threshold
+
+         Returns:
+             Assistant's response
+         """
+         # Extract images from messages
+         all_images = []
+         for msg in messages:
+             if "images" in msg and msg["images"]:
+                 all_images.extend(msg["images"])
+
+         # Apply chat template
+         prompt = self.processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         return self.generate(
+             prompt=prompt,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             images=all_images if all_images else None
+         )
+
+     def interactive_chat(self):
+         """Run an interactive chat session with multimodal support"""
+         print("\n" + "="*60)
+         print("Helion-V2.0-Thinking Interactive Chat")
+         print("Commands:")
+         print(" - Type 'exit' or 'quit' to end")
+         print(" - Type 'image <path>' to add an image")
+         print(" - Type 'clear' to reset conversation")
+         print("="*60 + "\n")
+
+         conversation_history = []
+
+         while True:
+             user_input = input("You: ").strip()
+
+             if user_input.lower() in ['exit', 'quit', 'q']:
+                 print("Goodbye!")
+                 break
+
+             if user_input.lower() == 'clear':
+                 conversation_history = []
+                 print("Conversation cleared.\n")
+                 continue
+
+             if not user_input:
+                 continue
+
+             # Check for image command
+             images = []
+             if user_input.lower().startswith('image '):
+                 image_path = user_input[6:].strip()
+                 try:
+                     image = Image.open(image_path)
+                     images.append(image)
+                     print(f"Image loaded: {image_path}")
+                     user_input = input("Your question about the image: ").strip()
+                 except Exception as e:
+                     print(f"Error loading image: {e}")
+                     continue
+
+             # Add user message to history
+             message = {
+                 "role": "user",
+                 "content": user_input
+             }
+             if images:
+                 message["images"] = images
+
+             conversation_history.append(message)
+
+             # Generate response
+             try:
+                 response = self.chat(conversation_history)
+
+                 # Add assistant response to history
+                 conversation_history.append({
+                     "role": "assistant",
+                     "content": response
+                 })
+
+                 print(f"\nAssistant: {response}\n")
+             except Exception as e:
+                 print(f"Error generating response: {e}\n")
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Helion-V2.0-Thinking Multimodal Inference"
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         default="DeepXR/Helion-V2.0-Thinking",
+         help="Model name or path"
+     )
+     parser.add_argument(
+         "--prompt",
+         type=str,
+         help="Input prompt for generation"
+     )
+     parser.add_argument(
+         "--image",
+         type=str,
+         help="Path to image file"
+     )
+     parser.add_argument(
+         "--interactive",
+         action="store_true",
+         help="Start interactive chat mode"
+     )
+     parser.add_argument(
+         "--load-in-8bit",
+         action="store_true",
+         help="Load model in 8-bit precision"
+     )
+     parser.add_argument(
+         "--load-in-4bit",
+         action="store_true",
+         help="Load model in 4-bit precision"
+     )
+     parser.add_argument(
+         "--max-tokens",
+         type=int,
+         default=512,
+         help="Maximum tokens to generate"
+     )
+     parser.add_argument(
+         "--temperature",
+         type=float,
+         default=0.7,
+         help="Sampling temperature"
+     )
+     parser.add_argument(
+         "--demo",
+         action="store_true",
+         help="Run demonstration examples"
+     )
+
+     args = parser.parse_args()
+
+     # Initialize model
+     model = HelionInference(
+         model_name=args.model,
+         load_in_8bit=args.load_in_8bit,
+         load_in_4bit=args.load_in_4bit
+     )
+
+     # Run interactive mode or examples
+     if args.interactive:
+         model.interactive_chat()
+     elif args.demo:
+         print("\n" + "="*60)
+         print("Running Demonstration Examples")
+         print("="*60 + "\n")
+
+         # Text generation example
+         print("1. Text Generation:")
+         print("-" * 40)
+         response = model.generate(
+             "Explain quantum entanglement in simple terms:",
+             max_new_tokens=256
+         )
+         print(f"Response: {response}\n")
+
+         # Function calling example
+         print("2. Function Calling:")
+         print("-" * 40)
+         tool_call = model.call_function(
+             "What is 45 multiplied by 23, plus 156?"
+         )
+         print(f"Tool call: {json.dumps(tool_call, indent=2)}\n")
+
+         # Structured output example
+         print("3. Structured Output:")
+         print("-" * 40)
+         schema = {
+             "type": "object",
+             "properties": {
+                 "summary": {"type": "string"},
+                 "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
+                 "key_points": {"type": "array", "items": {"type": "string"}}
+             }
+         }
+         structured = model.structured_output(
+             "Analyze this: The new product launch was highly successful.",
+             schema
+         )
+         print(f"Structured output: {json.dumps(structured, indent=2)}\n")
+
+     elif args.image:
+         # Image analysis
+         try:
+             image = Image.open(args.image)
+             prompt = args.prompt or "Describe this image in detail."
+             response = model.analyze_image(image, prompt, args.max_tokens)
+             print(f"\nImage: {args.image}")
+             print(f"Query: {prompt}")
+             print(f"Response: {response}\n")
+         except Exception as e:
+             print(f"Error processing image: {e}")
+
+     elif args.prompt:
+         response = model.generate(
+             prompt=args.prompt,
+             max_new_tokens=args.max_tokens,
+             temperature=args.temperature
+         )
+         print(f"\nPrompt: {args.prompt}")
+         print(f"Response: {response}\n")
+     else:
+         print("Please specify --interactive, --demo, --prompt, or --image")
+         print("Use --help for more information")
+
+
+ if __name__ == "__main__":
+     main()
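
Usage sketch: besides the CLI entry point in main(), the HelionInference class added above can be driven programmatically. The snippet below is a minimal example of that, assuming the committed file is importable as inference.py, the DeepXR/Helion-V2.0-Thinking checkpoint and its dependencies (torch, transformers, Pillow, and bitsandbytes for quantized loading) are available, and the image path and prompts are placeholders.

    from PIL import Image
    from inference import HelionInference

    # Load the model once; 4-bit quantization (requires bitsandbytes) keeps memory usage down
    model = HelionInference(load_in_4bit=True)

    # Plain text generation
    print(model.generate("Explain quantum entanglement in simple terms:", max_new_tokens=128))

    # Image analysis ("example.jpg" is a placeholder path)
    image = Image.open("example.jpg")
    print(model.analyze_image(image, "What objects are visible?"))

    # Tool selection and structured JSON output
    print(model.call_function("What is 45 multiplied by 23, plus 156?"))
    print(model.structured_output(
        "Summarize: the new product launch was highly successful.",
        {"type": "object", "properties": {"summary": {"type": "string"}}}
    ))

The same flows are reachable from the command line via the flags defined in main(), e.g. --demo, --interactive, or --prompt/--image combined with --load-in-4bit.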