AlexGall committed on
Commit
c3aebf2
·
verified ·
1 Parent(s): e95d324

Create deployment_config.json

Browse files
Files changed (1) hide show
  1. deployment_config.json +327 -0
deployment_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "deployment_metadata": {
3
+ "model_name": "Helion-V2.0-Thinking",
4
+ "version": "2.0.0",
5
+ "deployment_date": "2024-11-27",
6
+ "supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
7
+ "minimum_transformers_version": "4.36.0"
8
+ },
9
+
10
+ "server_configurations": {
11
+ "development": {
12
+ "environment": "dev",
13
+ "host": "0.0.0.0",
14
+ "port": 8000,
15
+ "workers": 1,
16
+ "max_batch_size": 1,
17
+ "max_concurrent_requests": 4,
18
+ "timeout_seconds": 300,
19
+ "enable_cors": true,
20
+ "cors_origins": ["*"],
21
+ "log_level": "DEBUG",
22
+ "cache_enabled": true,
23
+ "metrics_enabled": true
24
+ },
25
+ "production": {
26
+ "environment": "prod",
27
+ "host": "0.0.0.0",
28
+ "port": 8000,
29
+ "workers": 4,
30
+ "max_batch_size": 8,
31
+ "max_concurrent_requests": 32,
32
+ "timeout_seconds": 180,
33
+ "enable_cors": true,
34
+ "cors_origins": ["https://yourdomain.com"],
35
+ "log_level": "INFO",
36
+ "cache_enabled": true,
37
+ "metrics_enabled": true,
38
+ "health_check_enabled": true,
39
+ "auto_scaling": true
40
+ }
41
+ },
42
+
43
+ "vllm_config": {
44
+ "gpu_memory_utilization": 0.9,
45
+ "max_num_seqs": 256,
46
+ "max_num_batched_tokens": 8192,
47
+ "max_model_len": 200000,
48
+ "trust_remote_code": true,
49
+ "tensor_parallel_size": 1,
50
+ "pipeline_parallel_size": 1,
51
+ "dtype": "bfloat16",
52
+ "quantization": null,
53
+ "enforce_eager": false,
54
+ "enable_chunked_prefill": true,
55
+ "max_num_on_the_fly": 8,
56
+ "enable_prefix_caching": true,
57
+ "disable_custom_all_reduce": false
58
+ },
59
+
60
+ "text_generation_inference": {
61
+ "max_concurrent_requests": 128,
62
+ "max_best_of": 4,
63
+ "max_stop_sequences": 4,
64
+ "max_input_length": 199000,
65
+ "max_total_tokens": 200000,
66
+ "waiting_served_ratio": 1.2,
67
+ "max_batch_prefill_tokens": 4096,
68
+ "max_batch_total_tokens": 200000,
69
+ "max_waiting_tokens": 20,
70
+ "hostname": "0.0.0.0",
71
+ "port": 8080,
72
+ "master_shard_uds_path": "/tmp/text-generation-server",
73
+ "tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
74
+ "revision": "main",
75
+ "validation_workers": 2,
76
+ "json_output": false,
77
+ "otlp_endpoint": null,
78
+ "cors_allow_origin": "*",
79
+ "watermark_gamma": null,
80
+ "watermark_delta": null
81
+ },
82
+
83
+ "ollama_modelfile": {
84
+ "from": "DeepXR/Helion-V2.0-Thinking",
85
+ "template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
86
+ "parameter": {
87
+ "temperature": 0.7,
88
+ "top_p": 0.9,
89
+ "top_k": 50,
90
+ "num_ctx": 200000,
91
+ "num_predict": 2048,
92
+ "stop": ["</s>", "<|end|>"],
93
+ "repeat_penalty": 1.1,
94
+ "seed": -1
95
+ },
96
+ "system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
97
+ },
98
+
99
+ "api_endpoints": {
100
+ "generate": {
101
+ "path": "/v1/generate",
102
+ "method": "POST",
103
+ "rate_limit": "100/minute",
104
+ "request_schema": {
105
+ "prompt": "string (required)",
106
+ "max_tokens": "integer (optional, default: 1024)",
107
+ "temperature": "float (optional, default: 0.7)",
108
+ "top_p": "float (optional, default: 0.9)",
109
+ "stream": "boolean (optional, default: false)",
110
+ "images": "array<base64> (optional)"
111
+ }
112
+ },
113
+ "chat": {
114
+ "path": "/v1/chat/completions",
115
+ "method": "POST",
116
+ "rate_limit": "100/minute",
117
+ "openai_compatible": true,
118
+ "request_schema": {
119
+ "messages": "array (required)",
120
+ "model": "string (required)",
121
+ "temperature": "float (optional)",
122
+ "stream": "boolean (optional)"
123
+ }
124
+ },
125
+ "embeddings": {
126
+ "path": "/v1/embeddings",
127
+ "method": "POST",
128
+ "rate_limit": "200/minute",
129
+ "enabled": false
130
+ },
131
+ "health": {
132
+ "path": "/health",
133
+ "method": "GET",
134
+ "public": true
135
+ },
136
+ "metrics": {
137
+ "path": "/metrics",
138
+ "method": "GET",
139
+ "format": "prometheus",
140
+ "public": false
141
+ }
142
+ },
143
+
144
+ "load_balancing": {
145
+ "strategy": "round_robin",
146
+ "health_check_interval_seconds": 30,
147
+ "unhealthy_threshold": 3,
148
+ "healthy_threshold": 2,
149
+ "sticky_sessions": false,
150
+ "session_affinity_ttl_seconds": 3600
151
+ },
152
+
153
+ "caching": {
154
+ "enabled": true,
155
+ "backend": "redis",
156
+ "redis": {
157
+ "host": "localhost",
158
+ "port": 6379,
159
+ "db": 0,
160
+ "password": null,
161
+ "ssl": false,
162
+ "ttl_seconds": 3600,
163
+ "max_connections": 50
164
+ },
165
+ "cache_keys": {
166
+ "prompt_prefix": "helion:prompt:",
167
+ "result_prefix": "helion:result:",
168
+ "metrics_prefix": "helion:metrics:"
169
+ },
170
+ "cache_policies": {
171
+ "identical_prompts": true,
172
+ "similar_prompts": false,
173
+ "max_cache_size_mb": 1024
174
+ }
175
+ },
176
+
177
+ "monitoring": {
178
+ "prometheus": {
179
+ "enabled": true,
180
+ "port": 9090,
181
+ "metrics": [
182
+ "request_count",
183
+ "request_duration_seconds",
184
+ "token_generation_rate",
185
+ "gpu_memory_usage",
186
+ "active_requests",
187
+ "queue_size"
188
+ ]
189
+ },
190
+ "logging": {
191
+ "format": "json",
192
+ "output": "stdout",
193
+ "level": "INFO",
194
+ "include_request_body": false,
195
+ "include_response_body": false,
196
+ "log_rotation": {
197
+ "enabled": true,
198
+ "max_size_mb": 100,
199
+ "max_files": 10
200
+ }
201
+ },
202
+ "tracing": {
203
+ "enabled": false,
204
+ "backend": "jaeger",
205
+ "sampling_rate": 0.1
206
+ }
207
+ },
208
+
209
+ "security": {
210
+ "authentication": {
211
+ "enabled": true,
212
+ "type": "api_key",
213
+ "api_key_header": "X-API-Key",
214
+ "rate_limiting": true
215
+ },
216
+ "rate_limiting": {
217
+ "enabled": true,
218
+ "requests_per_minute": 100,
219
+ "requests_per_hour": 5000,
220
+ "burst_size": 10,
221
+ "strategy": "sliding_window"
222
+ },
223
+ "input_validation": {
224
+ "max_prompt_length": 199000,
225
+ "max_image_size_mb": 20,
226
+ "max_images_per_request": 10,
227
+ "allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
228
+ "sanitize_inputs": true
229
+ },
230
+ "output_filtering": {
231
+ "enabled": true,
232
+ "pii_detection": true,
233
+ "toxicity_filtering": true,
234
+ "content_policy_enforcement": true
235
+ }
236
+ },
237
+
238
+ "resource_management": {
239
+ "gpu": {
240
+ "memory_fraction": 0.95,
241
+ "allow_growth": true,
242
+ "per_process_gpu_memory_fraction": 0.9,
243
+ "visible_devices": "0",
244
+ "multi_gpu_strategy": "model_parallel"
245
+ },
246
+ "cpu": {
247
+ "num_threads": 8,
248
+ "num_workers": 4,
249
+ "affinity_enabled": false
250
+ },
251
+ "memory": {
252
+ "max_memory_gb": 64,
253
+ "swap_enabled": false,
254
+ "oom_handling": "graceful_degradation"
255
+ }
256
+ },
257
+
258
+ "auto_scaling": {
259
+ "enabled": false,
260
+ "min_replicas": 1,
261
+ "max_replicas": 10,
262
+ "target_gpu_utilization": 0.7,
263
+ "target_request_rate": 50,
264
+ "scale_up_threshold": 0.8,
265
+ "scale_down_threshold": 0.3,
266
+ "cooldown_period_seconds": 300
267
+ },
268
+
269
+ "backup_and_recovery": {
270
+ "checkpoint_enabled": false,
271
+ "checkpoint_interval_hours": 24,
272
+ "checkpoint_path": "/data/checkpoints",
273
+ "max_checkpoints": 5,
274
+ "recovery_strategy": "latest_checkpoint"
275
+ },
276
+
277
+ "experimental_features": {
278
+ "speculative_decoding": false,
279
+ "continuous_batching": true,
280
+ "dynamic_batching": true,
281
+ "model_compilation": false,
282
+ "mixed_precision": true,
283
+ "gradient_checkpointing": false
284
+ },
285
+
286
+ "model_serving_options": {
287
+ "triton_inference_server": {
288
+ "enabled": false,
289
+ "model_repository": "/models",
290
+ "backend": "python",
291
+ "max_batch_size": 8,
292
+ "dynamic_batching": true
293
+ },
294
+ "torchserve": {
295
+ "enabled": false,
296
+ "model_store": "/model_store",
297
+ "batch_size": 4,
298
+ "workers": 2
299
+ },
300
+ "ray_serve": {
301
+ "enabled": false,
302
+ "num_replicas": 2,
303
+ "max_concurrent_queries": 16
304
+ }
305
+ },
306
+
307
+ "cloud_deployment": {
308
+ "aws": {
309
+ "instance_type": "p4d.24xlarge",
310
+ "region": "us-east-1",
311
+ "use_spot_instances": false,
312
+ "s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
313
+ },
314
+ "gcp": {
315
+ "machine_type": "a2-highgpu-8g",
316
+ "region": "us-central1",
317
+ "preemptible": false,
318
+ "gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
319
+ },
320
+ "azure": {
321
+ "vm_size": "Standard_NC96ads_A100_v4",
322
+ "region": "eastus",
323
+ "spot_instance": false,
324
+ "blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
325
+ }
326
+ }
327
+ }