{ "monitoring": { "enabled": true, "interval_seconds": 15, "retention_days": 30 }, "metrics": { "system": { "enabled": true, "collect": [ "cpu_usage", "memory_usage", "disk_usage", "network_io" ] }, "gpu": { "enabled": true, "collect": [ "gpu_utilization", "gpu_memory_used", "gpu_memory_total", "gpu_temperature", "gpu_power_usage" ], "alert_thresholds": { "temperature_celsius": 85, "memory_utilization_percent": 95, "power_watts": 400 } }, "model": { "enabled": true, "collect": [ "requests_per_second", "tokens_per_second", "average_latency_ms", "p50_latency_ms", "p95_latency_ms", "p99_latency_ms", "error_rate", "active_connections", "queue_depth" ] }, "inference": { "enabled": true, "collect": [ "prompt_tokens", "completion_tokens", "total_tokens", "generation_time_ms", "preprocessing_time_ms", "postprocessing_time_ms" ] } }, "alerts": { "enabled": true, "channels": [ "email", "slack", "pagerduty" ], "rules": [ { "name": "high_error_rate", "condition": "error_rate > 0.05", "duration_seconds": 300, "severity": "critical", "message": "Error rate exceeded 5% for 5 minutes" }, { "name": "high_latency", "condition": "p95_latency_ms > 5000", "duration_seconds": 180, "severity": "warning", "message": "P95 latency exceeded 5 seconds" }, { "name": "gpu_temperature_high", "condition": "gpu_temperature > 85", "duration_seconds": 60, "severity": "critical", "message": "GPU temperature critically high" }, { "name": "memory_pressure", "condition": "gpu_memory_used / gpu_memory_total > 0.95", "duration_seconds": 300, "severity": "warning", "message": "GPU memory utilization above 95%" }, { "name": "low_throughput", "condition": "tokens_per_second < 10", "duration_seconds": 600, "severity": "warning", "message": "Throughput below 10 tokens/second" } ] }, "logging": { "level": "INFO", "format": "json", "outputs": [ { "type": "file", "path": "./logs/monitoring.log", "rotation": "daily", "retention_days": 30 }, { "type": "stdout", "enabled": true }, { "type": "elasticsearch", "enabled": false, "host": "localhost:9200", "index": "helion-metrics" } ] }, "prometheus": { "enabled": true, "port": 8001, "path": "/metrics", "namespace": "helion", "subsystem": "inference", "labels": { "model": "Helion-2.5-Rnd", "version": "2.5.0-rnd", "environment": "production" } }, "grafana": { "enabled": true, "dashboards": [ { "name": "Helion Overview", "file": "./monitoring/dashboards/overview.json", "refresh": "30s" }, { "name": "GPU Metrics", "file": "./monitoring/dashboards/gpu.json", "refresh": "15s" }, { "name": "Inference Performance", "file": "./monitoring/dashboards/inference.json", "refresh": "30s" } ] }, "health_checks": { "enabled": true, "endpoint": "/health", "interval_seconds": 30, "timeout_seconds": 10, "checks": [ { "name": "model_loaded", "type": "internal", "critical": true }, { "name": "gpu_available", "type": "internal", "critical": true }, { "name": "inference_responsive", "type": "endpoint", "url": "http://localhost:8000/v1/models", "critical": false } ] }, "tracing": { "enabled": true, "sample_rate": 0.1, "exporter": "jaeger", "endpoint": "http://localhost:14268/api/traces" }, "profiling": { "enabled": false, "interval_seconds": 3600, "duration_seconds": 300, "output_dir": "./profiling" } }