{
"deployment_metadata": {
"model_name": "Helion-V2.0-Thinking",
"version": "2.0.0",
"deployment_date": "2024-11-27",
"supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
"minimum_transformers_version": "4.36.0"
},
"server_configurations": {
"development": {
"environment": "dev",
"host": "0.0.0.0",
"port": 8000,
"workers": 1,
"max_batch_size": 1,
"max_concurrent_requests": 4,
"timeout_seconds": 300,
"enable_cors": true,
"cors_origins": ["*"],
"log_level": "DEBUG",
"cache_enabled": true,
"metrics_enabled": true
},
"production": {
"environment": "prod",
"host": "0.0.0.0",
"port": 8000,
"workers": 4,
"max_batch_size": 8,
"max_concurrent_requests": 32,
"timeout_seconds": 180,
"enable_cors": true,
"cors_origins": ["https://yourdomain.com"],
"log_level": "INFO",
"cache_enabled": true,
"metrics_enabled": true,
"health_check_enabled": true,
"auto_scaling": true
}
},
"vllm_config": {
"gpu_memory_utilization": 0.9,
"max_num_seqs": 256,
"max_num_batched_tokens": 8192,
"max_model_len": 200000,
"trust_remote_code": true,
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"quantization": null,
"enforce_eager": false,
"enable_chunked_prefill": true,
"max_num_on_the_fly": 8,
"enable_prefix_caching": true,
"disable_custom_all_reduce": false
},
"text_generation_inference": {
"max_concurrent_requests": 128,
"max_best_of": 4,
"max_stop_sequences": 4,
"max_input_length": 199000,
"max_total_tokens": 200000,
"waiting_served_ratio": 1.2,
"max_batch_prefill_tokens": 199000,
"max_batch_total_tokens": 200000,
"max_waiting_tokens": 20,
"hostname": "0.0.0.0",
"port": 8080,
"master_shard_uds_path": "/tmp/text-generation-server",
"tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
"revision": "main",
"validation_workers": 2,
"json_output": false,
"otlp_endpoint": null,
"cors_allow_origin": "*",
"watermark_gamma": null,
"watermark_delta": null
},
"ollama_modelfile": {
"from": "DeepXR/Helion-V2.0-Thinking",
"template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
"parameter": {
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50,
"num_ctx": 200000,
"num_predict": 2048,
"stop": ["</s>", "<|end|>"],
"repeat_penalty": 1.1,
"seed": -1
},
"system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
},
"api_endpoints": {
"generate": {
"path": "/v1/generate",
"method": "POST",
"rate_limit": "100/minute",
"request_schema": {
"prompt": "string (required)",
"max_tokens": "integer (optional, default: 1024)",
"temperature": "float (optional, default: 0.7)",
"top_p": "float (optional, default: 0.9)",
"stream": "boolean (optional, default: false)",
"images": "array<base64> (optional)"
}
},
"chat": {
"path": "/v1/chat/completions",
"method": "POST",
"rate_limit": "100/minute",
"openai_compatible": true,
"request_schema": {
"messages": "array (required)",
"model": "string (required)",
"temperature": "float (optional)",
"stream": "boolean (optional)"
}
},
"embeddings": {
"path": "/v1/embeddings",
"method": "POST",
"rate_limit": "200/minute",
"enabled": false
},
"health": {
"path": "/health",
"method": "GET",
"public": true
},
"metrics": {
"path": "/metrics",
"method": "GET",
"format": "prometheus",
"public": false
}
},
"load_balancing": {
"strategy": "round_robin",
"health_check_interval_seconds": 30,
"unhealthy_threshold": 3,
"healthy_threshold": 2,
"sticky_sessions": false,
"session_affinity_ttl_seconds": 3600
},
"caching": {
"enabled": true,
"backend": "redis",
"redis": {
"host": "localhost",
"port": 6379,
"db": 0,
"password": null,
"ssl": false,
"ttl_seconds": 3600,
"max_connections": 50
},
"cache_keys": {
"prompt_prefix": "helion:prompt:",
"result_prefix": "helion:result:",
"metrics_prefix": "helion:metrics:"
},
"cache_policies": {
"identical_prompts": true,
"similar_prompts": false,
"max_cache_size_mb": 1024
}
},
"monitoring": {
"prometheus": {
"enabled": true,
"port": 9090,
"metrics": [
"request_count",
"request_duration_seconds",
"token_generation_rate",
"gpu_memory_usage",
"active_requests",
"queue_size"
]
},
"logging": {
"format": "json",
"output": "stdout",
"level": "INFO",
"include_request_body": false,
"include_response_body": false,
"log_rotation": {
"enabled": true,
"max_size_mb": 100,
"max_files": 10
}
},
"tracing": {
"enabled": false,
"backend": "jaeger",
"sampling_rate": 0.1
}
},
"security": {
"authentication": {
"enabled": true,
"type": "api_key",
"api_key_header": "X-API-Key",
"rate_limiting": true
},
"rate_limiting": {
"enabled": true,
"requests_per_minute": 100,
"requests_per_hour": 5000,
"burst_size": 10,
"strategy": "sliding_window"
},
"input_validation": {
"max_prompt_length": 199000,
"max_image_size_mb": 20,
"max_images_per_request": 10,
"allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
"sanitize_inputs": true
},
"output_filtering": {
"enabled": true,
"pii_detection": true,
"toxicity_filtering": true,
"content_policy_enforcement": true
}
},
"resource_management": {
"gpu": {
"memory_fraction": 0.95,
"allow_growth": true,
"per_process_gpu_memory_fraction": 0.9,
"visible_devices": "0",
"multi_gpu_strategy": "model_parallel"
},
"cpu": {
"num_threads": 8,
"num_workers": 4,
"affinity_enabled": false
},
"memory": {
"max_memory_gb": 64,
"swap_enabled": false,
"oom_handling": "graceful_degradation"
}
},
"auto_scaling": {
"enabled": false,
"min_replicas": 1,
"max_replicas": 10,
"target_gpu_utilization": 0.7,
"target_request_rate": 50,
"scale_up_threshold": 0.8,
"scale_down_threshold": 0.3,
"cooldown_period_seconds": 300
},
"backup_and_recovery": {
"checkpoint_enabled": false,
"checkpoint_interval_hours": 24,
"checkpoint_path": "/data/checkpoints",
"max_checkpoints": 5,
"recovery_strategy": "latest_checkpoint"
},
"experimental_features": {
"speculative_decoding": false,
"continuous_batching": true,
"dynamic_batching": true,
"model_compilation": false,
"mixed_precision": true,
"gradient_checkpointing": false
},
"model_serving_options": {
"triton_inference_server": {
"enabled": false,
"model_repository": "/models",
"backend": "python",
"max_batch_size": 8,
"dynamic_batching": true
},
"torchserve": {
"enabled": false,
"model_store": "/model_store",
"batch_size": 4,
"workers": 2
},
"ray_serve": {
"enabled": false,
"num_replicas": 2,
"max_concurrent_queries": 16
}
},
"cloud_deployment": {
"aws": {
"instance_type": "p4d.24xlarge",
"region": "us-east-1",
"use_spot_instances": false,
"s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
},
"gcp": {
"machine_type": "a2-highgpu-8g",
"region": "us-central1",
"preemptible": false,
"gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
},
"azure": {
"vm_size": "Standard_NC96ads_A100_v4",
"region": "eastus",
"spot_instance": false,
"blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
}
}
}