{
  "deployment_metadata": {
    "model_name": "Helion-V2.0-Thinking",
    "version": "2.0.0",
    "deployment_date": "2024-11-27",
    "supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
    "minimum_transformers_version": "4.36.0"
  },
  "server_configurations": {
    "development": {
      "environment": "dev",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 1,
      "max_batch_size": 1,
      "max_concurrent_requests": 4,
      "timeout_seconds": 300,
      "enable_cors": true,
      "cors_origins": ["*"],
      "log_level": "DEBUG",
      "cache_enabled": true,
      "metrics_enabled": true
    },
    "production": {
      "environment": "prod",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 4,
      "max_batch_size": 8,
      "max_concurrent_requests": 32,
      "timeout_seconds": 180,
      "enable_cors": true,
      "cors_origins": ["https://yourdomain.com"],
      "log_level": "INFO",
      "cache_enabled": true,
      "metrics_enabled": true,
      "health_check_enabled": true,
      "auto_scaling": true
    }
  },
  "vllm_config": {
    "gpu_memory_utilization": 0.9,
    "max_num_seqs": 256,
    "max_num_batched_tokens": 8192,
    "max_model_len": 200000,
    "trust_remote_code": true,
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1,
    "dtype": "bfloat16",
    "quantization": null,
    "enforce_eager": false,
    "enable_chunked_prefill": true,
    "max_num_on_the_fly": 8,
    "enable_prefix_caching": true,
    "disable_custom_all_reduce": false
  },
  "text_generation_inference": {
    "max_concurrent_requests": 128,
    "max_best_of": 4,
    "max_stop_sequences": 4,
    "max_input_length": 199000,
    "max_total_tokens": 200000,
    "waiting_served_ratio": 1.2,
    "max_batch_prefill_tokens": 4096,
    "max_batch_total_tokens": 200000,
    "max_waiting_tokens": 20,
    "hostname": "0.0.0.0",
    "port": 8080,
    "master_shard_uds_path": "/tmp/text-generation-server",
    "tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
    "revision": "main",
    "validation_workers": 2,
    "json_output": false,
    "otlp_endpoint": null,
    "cors_allow_origin": "*",
    "watermark_gamma": null,
    "watermark_delta": null
  },
  "ollama_modelfile": {
    "from": "DeepXR/Helion-V2.0-Thinking",
    "template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
    "parameter": {
      "temperature": 0.7,
      "top_p": 0.9,
      "top_k": 50,
      "num_ctx": 200000,
      "num_predict": 2048,
      "stop": ["</s>", "<|end|>"],
      "repeat_penalty": 1.1,
      "seed": -1
    },
    "system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
  },
  "api_endpoints": {
    "generate": {
      "path": "/v1/generate",
      "method": "POST",
      "rate_limit": "100/minute",
      "request_schema": {
        "prompt": "string (required)",
        "max_tokens": "integer (optional, default: 1024)",
        "temperature": "float (optional, default: 0.7)",
        "top_p": "float (optional, default: 0.9)",
        "stream": "boolean (optional, default: false)",
        "images": "array<base64> (optional)"
      }
    },
    "chat": {
      "path": "/v1/chat/completions",
      "method": "POST",
      "rate_limit": "100/minute",
      "openai_compatible": true,
      "request_schema": {
        "messages": "array (required)",
        "model": "string (required)",
        "temperature": "float (optional)",
        "stream": "boolean (optional)"
      }
    },
    "embeddings": {
      "path": "/v1/embeddings",
      "method": "POST",
      "rate_limit": "200/minute",
      "enabled": false
    },
    "health": {
      "path": "/health",
      "method": "GET",
      "public": true
    },
    "metrics": {
      "path": "/metrics",
      "method": "GET",
      "format": "prometheus",
      "public": false
    }
  },
  "load_balancing": {
    "strategy": "round_robin",
    "health_check_interval_seconds": 30,
    "unhealthy_threshold": 3,
    "healthy_threshold": 2,
    "sticky_sessions": false,
    "session_affinity_ttl_seconds": 3600
  },
  "caching": {
    "enabled": true,
    "backend": "redis",
    "redis": {
      "host": "localhost",
      "port": 6379,
      "db": 0,
      "password": null,
      "ssl": false,
      "ttl_seconds": 3600,
      "max_connections": 50
    },
    "cache_keys": {
      "prompt_prefix": "helion:prompt:",
      "result_prefix": "helion:result:",
      "metrics_prefix": "helion:metrics:"
    },
    "cache_policies": {
      "identical_prompts": true,
      "similar_prompts": false,
      "max_cache_size_mb": 1024
    }
  },
  "monitoring": {
    "prometheus": {
      "enabled": true,
      "port": 9090,
      "metrics": [
        "request_count",
        "request_duration_seconds",
        "token_generation_rate",
        "gpu_memory_usage",
        "active_requests",
        "queue_size"
      ]
    },
    "logging": {
      "format": "json",
      "output": "stdout",
      "level": "INFO",
      "include_request_body": false,
      "include_response_body": false,
      "log_rotation": {
        "enabled": true,
        "max_size_mb": 100,
        "max_files": 10
      }
    },
    "tracing": {
      "enabled": false,
      "backend": "jaeger",
      "sampling_rate": 0.1
    }
  },
  "security": {
    "authentication": {
      "enabled": true,
      "type": "api_key",
      "api_key_header": "X-API-Key",
      "rate_limiting": true
    },
    "rate_limiting": {
      "enabled": true,
      "requests_per_minute": 100,
      "requests_per_hour": 5000,
      "burst_size": 10,
      "strategy": "sliding_window"
    },
    "input_validation": {
      "max_prompt_length": 199000,
      "max_image_size_mb": 20,
      "max_images_per_request": 10,
      "allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
      "sanitize_inputs": true
    },
    "output_filtering": {
      "enabled": true,
      "pii_detection": true,
      "toxicity_filtering": true,
      "content_policy_enforcement": true
    }
  },
  "resource_management": {
    "gpu": {
      "memory_fraction": 0.95,
      "allow_growth": true,
      "per_process_gpu_memory_fraction": 0.9,
      "visible_devices": "0",
      "multi_gpu_strategy": "model_parallel"
    },
    "cpu": {
      "num_threads": 8,
      "num_workers": 4,
      "affinity_enabled": false
    },
    "memory": {
      "max_memory_gb": 64,
      "swap_enabled": false,
      "oom_handling": "graceful_degradation"
    }
  },
  "auto_scaling": {
    "enabled": false,
    "min_replicas": 1,
    "max_replicas": 10,
    "target_gpu_utilization": 0.7,
    "target_request_rate": 50,
    "scale_up_threshold": 0.8,
    "scale_down_threshold": 0.3,
    "cooldown_period_seconds": 300
  },
  "backup_and_recovery": {
    "checkpoint_enabled": false,
    "checkpoint_interval_hours": 24,
    "checkpoint_path": "/data/checkpoints",
    "max_checkpoints": 5,
    "recovery_strategy": "latest_checkpoint"
  },
  "experimental_features": {
    "speculative_decoding": false,
    "continuous_batching": true,
    "dynamic_batching": true,
    "model_compilation": false,
    "mixed_precision": true,
    "gradient_checkpointing": false
  },
  "model_serving_options": {
    "triton_inference_server": {
      "enabled": false,
      "model_repository": "/models",
      "backend": "python",
      "max_batch_size": 8,
      "dynamic_batching": true
    },
    "torchserve": {
      "enabled": false,
      "model_store": "/model_store",
      "batch_size": 4,
      "workers": 2
    },
    "ray_serve": {
      "enabled": false,
      "num_replicas": 2,
      "max_concurrent_queries": 16
    }
  },
  "cloud_deployment": {
    "aws": {
      "instance_type": "p4d.24xlarge",
      "region": "us-east-1",
      "use_spot_instances": false,
      "s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
    },
    "gcp": {
      "machine_type": "a2-highgpu-8g",
      "region": "us-central1",
      "preemptible": false,
      "gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
    },
    "azure": {
      "vm_size": "Standard_NC96ads_A100_v4",
      "region": "eastus",
      "spot_instance": false,
      "blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
    }
  }
}