{
  "deployment_metadata": {
    "model_name": "Helion-V2.0-Thinking",
    "version": "2.0.0",
    "deployment_date": "2024-11-27",
    "supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
    "minimum_transformers_version": "4.36.0"
  },
  "server_configurations": {
    "development": {
      "environment": "dev",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 1,
      "max_batch_size": 1,
      "max_concurrent_requests": 4,
      "timeout_seconds": 300,
      "enable_cors": true,
      "cors_origins": ["*"],
      "log_level": "DEBUG",
      "cache_enabled": true,
      "metrics_enabled": true
    },
    "production": {
      "environment": "prod",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 4,
      "max_batch_size": 8,
      "max_concurrent_requests": 32,
      "timeout_seconds": 180,
      "enable_cors": true,
      "cors_origins": ["https://yourdomain.com"],
      "log_level": "INFO",
      "cache_enabled": true,
      "metrics_enabled": true,
      "health_check_enabled": true,
      "auto_scaling": true
    }
  },
  "vllm_config": {
    "gpu_memory_utilization": 0.9,
    "max_num_seqs": 256,
    "max_num_batched_tokens": 8192,
    "max_model_len": 200000,
    "trust_remote_code": true,
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1,
    "dtype": "bfloat16",
    "quantization": null,
    "enforce_eager": false,
    "enable_chunked_prefill": true,
    "max_num_on_the_fly": 8,
    "enable_prefix_caching": true,
    "disable_custom_all_reduce": false
  },
  "text_generation_inference": {
    "max_concurrent_requests": 128,
    "max_best_of": 4,
    "max_stop_sequences": 4,
    "max_input_length": 199000,
    "max_total_tokens": 200000,
    "waiting_served_ratio": 1.2,
    "max_batch_prefill_tokens": 199000,
    "max_batch_total_tokens": 200000,
    "max_waiting_tokens": 20,
    "hostname": "0.0.0.0",
    "port": 8080,
    "master_shard_uds_path": "/tmp/text-generation-server",
    "tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
    "revision": "main",
    "validation_workers": 2,
    "json_output": false,
    "otlp_endpoint": null,
    "cors_allow_origin": "*",
    "watermark_gamma": null,
    "watermark_delta": null
  },
  "ollama_modelfile": {
    "from": "DeepXR/Helion-V2.0-Thinking",
    "template": "[INST] {{ .System }} \n{{ .Prompt }} [/INST]",
    "parameter": {
      "temperature": 0.7,
      "top_p": 0.9,
      "top_k": 50,
      "num_ctx": 200000,
      "num_predict": 2048,
      "stop": ["</s>", "<|end|>"],
      "repeat_penalty": 1.1,
      "seed": -1
    },
    "system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
  },
  "api_endpoints": {
    "generate": {
      "path": "/v1/generate",
      "method": "POST",
      "rate_limit": "100/minute",
      "request_schema": {
        "prompt": "string (required)",
        "max_tokens": "integer (optional, default: 1024)",
        "temperature": "float (optional, default: 0.7)",
        "top_p": "float (optional, default: 0.9)",
        "stream": "boolean (optional, default: false)",
        "images": "array (optional)"
      }
    },
    "chat": {
      "path": "/v1/chat/completions",
      "method": "POST",
      "rate_limit": "100/minute",
      "openai_compatible": true,
      "request_schema": {
        "messages": "array (required)",
        "model": "string (required)",
        "temperature": "float (optional)",
        "stream": "boolean (optional)"
      }
    },
    "embeddings": {
      "path": "/v1/embeddings",
      "method": "POST",
      "rate_limit": "200/minute",
      "enabled": false
    },
    "health": {
      "path": "/health",
      "method": "GET",
      "public": true
    },
    "metrics": {
      "path": "/metrics",
      "method": "GET",
      "format": "prometheus",
      "public": false
    }
  },
  "load_balancing": {
    "strategy": "round_robin",
    "health_check_interval_seconds": 30,
    "unhealthy_threshold": 3,
    "healthy_threshold": 2,
    "sticky_sessions": false,
    "session_affinity_ttl_seconds": 3600
  },
  "caching": {
    "enabled": true,
    "backend": "redis",
    "redis": {
      "host": "localhost",
      "port": 6379,
      "db": 0,
      "password": null,
      "ssl": false,
      "ttl_seconds": 3600,
      "max_connections": 50
    },
    "cache_keys": {
      "prompt_prefix": "helion:prompt:",
      "result_prefix": "helion:result:",
      "metrics_prefix": "helion:metrics:"
    },
    "cache_policies": {
      "identical_prompts": true,
      "similar_prompts": false,
      "max_cache_size_mb": 1024
    }
  },
  "monitoring": {
    "prometheus": {
      "enabled": true,
      "port": 9090,
      "metrics": [
        "request_count",
        "request_duration_seconds",
        "token_generation_rate",
        "gpu_memory_usage",
        "active_requests",
        "queue_size"
      ]
    },
    "logging": {
      "format": "json",
      "output": "stdout",
      "level": "INFO",
      "include_request_body": false,
      "include_response_body": false,
      "log_rotation": {
        "enabled": true,
        "max_size_mb": 100,
        "max_files": 10
      }
    },
    "tracing": {
      "enabled": false,
      "backend": "jaeger",
      "sampling_rate": 0.1
    }
  },
  "security": {
    "authentication": {
      "enabled": true,
      "type": "api_key",
      "api_key_header": "X-API-Key",
      "rate_limiting": true
    },
    "rate_limiting": {
      "enabled": true,
      "requests_per_minute": 100,
      "requests_per_hour": 5000,
      "burst_size": 10,
      "strategy": "sliding_window"
    },
    "input_validation": {
      "max_prompt_length": 199000,
      "max_image_size_mb": 20,
      "max_images_per_request": 10,
      "allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
      "sanitize_inputs": true
    },
    "output_filtering": {
      "enabled": true,
      "pii_detection": true,
      "toxicity_filtering": true,
      "content_policy_enforcement": true
    }
  },
  "resource_management": {
    "gpu": {
      "memory_fraction": 0.95,
      "allow_growth": true,
      "per_process_gpu_memory_fraction": 0.9,
      "visible_devices": "0",
      "multi_gpu_strategy": "model_parallel"
    },
    "cpu": {
      "num_threads": 8,
      "num_workers": 4,
      "affinity_enabled": false
    },
    "memory": {
      "max_memory_gb": 64,
      "swap_enabled": false,
      "oom_handling": "graceful_degradation"
    }
  },
  "auto_scaling": {
    "enabled": false,
    "min_replicas": 1,
    "max_replicas": 10,
    "target_gpu_utilization": 0.7,
    "target_request_rate": 50,
    "scale_up_threshold": 0.8,
    "scale_down_threshold": 0.3,
    "cooldown_period_seconds": 300
  },
  "backup_and_recovery": {
    "checkpoint_enabled": false,
    "checkpoint_interval_hours": 24,
    "checkpoint_path": "/data/checkpoints",
    "max_checkpoints": 5,
    "recovery_strategy": "latest_checkpoint"
  },
  "experimental_features": {
    "speculative_decoding": false,
    "continuous_batching": true,
    "dynamic_batching": true,
    "model_compilation": false,
    "mixed_precision": true,
    "gradient_checkpointing": false
  },
  "model_serving_options": {
    "triton_inference_server": {
      "enabled": false,
      "model_repository": "/models",
      "backend": "python",
      "max_batch_size": 8,
      "dynamic_batching": true
    },
    "torchserve": {
      "enabled": false,
      "model_store": "/model_store",
      "batch_size": 4,
      "workers": 2
    },
    "ray_serve": {
      "enabled": false,
      "num_replicas": 2,
      "max_concurrent_queries": 16
    }
  },
  "cloud_deployment": {
    "aws": {
      "instance_type": "p4d.24xlarge",
      "region": "us-east-1",
      "use_spot_instances": false,
      "s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
    },
    "gcp": {
      "machine_type": "a2-highgpu-8g",
      "region": "us-central1",
      "preemptible": false,
      "gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
    },
    "azure": {
      "vm_size": "Standard_NC96ads_A100_v4",
      "region": "eastus",
      "spot_instance": false,
      "blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
    }
  }
}