{
"deployment_metadata": {
"model_name": "Helion-V2.0-Thinking",
"version": "2.0.0",
"deployment_date": "2024-11-27",
"supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
"minimum_transformers_version": "4.36.0"
},
"server_configurations": {
"development": {
"environment": "dev",
"host": "0.0.0.0",
"port": 8000,
"workers": 1,
"max_batch_size": 1,
"max_concurrent_requests": 4,
"timeout_seconds": 300,
"enable_cors": true,
"cors_origins": ["*"],
"log_level": "DEBUG",
"cache_enabled": true,
"metrics_enabled": true
},
"production": {
"environment": "prod",
"host": "0.0.0.0",
"port": 8000,
"workers": 4,
"max_batch_size": 8,
"max_concurrent_requests": 32,
"timeout_seconds": 180,
"enable_cors": true,
"cors_origins": ["https://yourdomain.com"],
"log_level": "INFO",
"cache_enabled": true,
"metrics_enabled": true,
"health_check_enabled": true,
"auto_scaling": true
}
},
"vllm_config": {
"gpu_memory_utilization": 0.9,
"max_num_seqs": 256,
"max_num_batched_tokens": 8192,
"max_model_len": 200000,
"trust_remote_code": true,
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"quantization": null,
"enforce_eager": false,
"enable_chunked_prefill": true,
"max_num_on_the_fly": 8,
"enable_prefix_caching": true,
"disable_custom_all_reduce": false
},
"text_generation_inference": {
"max_concurrent_requests": 128,
"max_best_of": 4,
"max_stop_sequences": 4,
"max_input_length": 199000,
"max_total_tokens": 200000,
"waiting_served_ratio": 1.2,
"max_batch_prefill_tokens": 199000,
"max_batch_total_tokens": 200000,
"max_waiting_tokens": 20,
"hostname": "0.0.0.0",
"port": 8080,
"master_shard_uds_path": "/tmp/text-generation-server",
"tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
"revision": "main",
"validation_workers": 2,
"json_output": false,
"otlp_endpoint": null,
"cors_allow_origin": "*",
"watermark_gamma": null,
"watermark_delta": null
},
"ollama_modelfile": {
"from": "DeepXR/Helion-V2.0-Thinking",
"template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
"parameter": {
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50,
"num_ctx": 200000,
"num_predict": 2048,
"stop": ["</s>", "<|end|>"],
"repeat_penalty": 1.1,
"seed": -1
},
"system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
},
"api_endpoints": {
"generate": {
"path": "/v1/generate",
"method": "POST",
"rate_limit": "100/minute",
"request_schema": {
"prompt": "string (required)",
"max_tokens": "integer (optional, default: 1024)",
"temperature": "float (optional, default: 0.7)",
"top_p": "float (optional, default: 0.9)",
"stream": "boolean (optional, default: false)",
"images": "array<base64> (optional)"
}
},
"chat": {
"path": "/v1/chat/completions",
"method": "POST",
"rate_limit": "100/minute",
"openai_compatible": true,
"request_schema": {
"messages": "array (required)",
"model": "string (required)",
"temperature": "float (optional)",
"stream": "boolean (optional)"
}
},
"embeddings": {
"path": "/v1/embeddings",
"method": "POST",
"rate_limit": "200/minute",
"enabled": false
},
"health": {
"path": "/health",
"method": "GET",
"public": true
},
"metrics": {
"path": "/metrics",
"method": "GET",
"format": "prometheus",
"public": false
}
},
"load_balancing": {
"strategy": "round_robin",
"health_check_interval_seconds": 30,
"unhealthy_threshold": 3,
"healthy_threshold": 2,
"sticky_sessions": false,
"session_affinity_ttl_seconds": 3600
},
"caching": {
"enabled": true,
"backend": "redis",
"redis": {
"host": "localhost",
"port": 6379,
"db": 0,
"password": null,
"ssl": false,
"ttl_seconds": 3600,
"max_connections": 50
},
"cache_keys": {
"prompt_prefix": "helion:prompt:",
"result_prefix": "helion:result:",
"metrics_prefix": "helion:metrics:"
},
"cache_policies": {
"identical_prompts": true,
"similar_prompts": false,
"max_cache_size_mb": 1024
}
},
"monitoring": {
"prometheus": {
"enabled": true,
"port": 9090,
"metrics": [
"request_count",
"request_duration_seconds",
"token_generation_rate",
"gpu_memory_usage",
"active_requests",
"queue_size"
]
},
"logging": {
"format": "json",
"output": "stdout",
"level": "INFO",
"include_request_body": false,
"include_response_body": false,
"log_rotation": {
"enabled": true,
"max_size_mb": 100,
"max_files": 10
}
},
"tracing": {
"enabled": false,
"backend": "jaeger",
"sampling_rate": 0.1
}
},
"security": {
"authentication": {
"enabled": true,
"type": "api_key",
"api_key_header": "X-API-Key",
"rate_limiting": true
},
"rate_limiting": {
"enabled": true,
"requests_per_minute": 100,
"requests_per_hour": 5000,
"burst_size": 10,
"strategy": "sliding_window"
},
"input_validation": {
"max_prompt_length": 199000,
"max_image_size_mb": 20,
"max_images_per_request": 10,
"allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
"sanitize_inputs": true
},
"output_filtering": {
"enabled": true,
"pii_detection": true,
"toxicity_filtering": true,
"content_policy_enforcement": true
}
},
"resource_management": {
"gpu": {
"memory_fraction": 0.95,
"allow_growth": true,
"per_process_gpu_memory_fraction": 0.9,
"visible_devices": "0",
"multi_gpu_strategy": "model_parallel"
},
"cpu": {
"num_threads": 8,
"num_workers": 4,
"affinity_enabled": false
},
"memory": {
"max_memory_gb": 64,
"swap_enabled": false,
"oom_handling": "graceful_degradation"
}
},
"auto_scaling": {
"enabled": false,
"min_replicas": 1,
"max_replicas": 10,
"target_gpu_utilization": 0.7,
"target_request_rate": 50,
"scale_up_threshold": 0.8,
"scale_down_threshold": 0.3,
"cooldown_period_seconds": 300
},
"backup_and_recovery": {
"checkpoint_enabled": false,
"checkpoint_interval_hours": 24,
"checkpoint_path": "/data/checkpoints",
"max_checkpoints": 5,
"recovery_strategy": "latest_checkpoint"
},
"experimental_features": {
"speculative_decoding": false,
"continuous_batching": true,
"dynamic_batching": true,
"model_compilation": false,
"mixed_precision": true,
"gradient_checkpointing": false
},
"model_serving_options": {
"triton_inference_server": {
"enabled": false,
"model_repository": "/models",
"backend": "python",
"max_batch_size": 8,
"dynamic_batching": true
},
"torchserve": {
"enabled": false,
"model_store": "/model_store",
"batch_size": 4,
"workers": 2
},
"ray_serve": {
"enabled": false,
"num_replicas": 2,
"max_concurrent_queries": 16
}
},
"cloud_deployment": {
"aws": {
"instance_type": "p4d.24xlarge",
"region": "us-east-1",
"use_spot_instances": false,
"s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
},
"gcp": {
"machine_type": "a2-highgpu-8g",
"region": "us-central1",
"preemptible": false,
"gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
},
"azure": {
"vm_size": "Standard_NC96ads_A100_v4",
"region": "eastus",
"spot_instance": false,
"blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
}
}
}