{
"monitoring": {
"enabled": true,
"interval_seconds": 15,
"retention_days": 30
},
"metrics": {
"system": {
"enabled": true,
"collect": [
"cpu_usage",
"memory_usage",
"disk_usage",
"network_io"
]
},
"gpu": {
"enabled": true,
"collect": [
"gpu_utilization",
"gpu_memory_used",
"gpu_memory_total",
"gpu_temperature",
"gpu_power_usage"
],
"alert_thresholds": {
"temperature_celsius": 85,
"memory_utilization_percent": 95,
"power_watts": 400
}
},
"model": {
"enabled": true,
"collect": [
"requests_per_second",
"tokens_per_second",
"average_latency_ms",
"p50_latency_ms",
"p95_latency_ms",
"p99_latency_ms",
"error_rate",
"active_connections",
"queue_depth"
]
},
"inference": {
"enabled": true,
"collect": [
"prompt_tokens",
"completion_tokens",
"total_tokens",
"generation_time_ms",
"preprocessing_time_ms",
"postprocessing_time_ms"
]
}
},
"alerts": {
"enabled": true,
"channels": [
"email",
"slack",
"pagerduty"
],
"rules": [
{
"name": "high_error_rate",
"condition": "error_rate > 0.05",
"duration_seconds": 300,
"severity": "critical",
"message": "Error rate exceeded 5% for 5 minutes"
},
{
"name": "high_latency",
"condition": "p95_latency_ms > 5000",
"duration_seconds": 180,
"severity": "warning",
"message": "P95 latency exceeded 5 seconds"
},
{
"name": "gpu_temperature_high",
"condition": "gpu_temperature > 85",
"duration_seconds": 60,
"severity": "critical",
"message": "GPU temperature critically high"
},
{
"name": "memory_pressure",
"condition": "gpu_memory_used / gpu_memory_total > 0.95",
"duration_seconds": 300,
"severity": "warning",
"message": "GPU memory utilization above 95%"
},
{
"name": "low_throughput",
"condition": "tokens_per_second < 10",
"duration_seconds": 600,
"severity": "warning",
"message": "Throughput below 10 tokens/second"
}
]
},
"logging": {
"level": "INFO",
"format": "json",
"outputs": [
{
"type": "file",
"path": "./logs/monitoring.log",
"rotation": "daily",
"retention_days": 30
},
{
"type": "stdout",
"enabled": true
},
{
"type": "elasticsearch",
"enabled": false,
"host": "localhost:9200",
"index": "helion-metrics"
}
]
},
"prometheus": {
"enabled": true,
"port": 8001,
"path": "/metrics",
"namespace": "helion",
"subsystem": "inference",
"labels": {
"model": "Helion-V2.5-Rnd",
"version": "2.5.0-rnd",
"environment": "production"
}
},
"grafana": {
"enabled": true,
"dashboards": [
{
"name": "Helion Overview",
"file": "./monitoring/dashboards/overview.json",
"refresh": "30s"
},
{
"name": "GPU Metrics",
"file": "./monitoring/dashboards/gpu.json",
"refresh": "15s"
},
{
"name": "Inference Performance",
"file": "./monitoring/dashboards/inference.json",
"refresh": "30s"
}
]
},
"health_checks": {
"enabled": true,
"endpoint": "/health",
"interval_seconds": 30,
"timeout_seconds": 10,
"checks": [
{
"name": "model_loaded",
"type": "internal",
"critical": true
},
{
"name": "gpu_available",
"type": "internal",
"critical": true
},
{
"name": "inference_responsive",
"type": "endpoint",
"url": "http://localhost:8000/v1/models",
"critical": false
}
]
},
"tracing": {
"enabled": true,
"sample_rate": 0.1,
"exporter": "jaeger",
"endpoint": "http://localhost:14268/api/traces"
},
"profiling": {
"enabled": false,
"interval_seconds": 3600,
"duration_seconds": 300,
"output_dir": "./profiling"
}
}