{
  "monitoring": {
    "enabled": true,
    "interval_seconds": 15,
    "retention_days": 30
  },
  "metrics": {
    "system": {
      "enabled": true,
      "collect": [
        "cpu_usage",
        "memory_usage",
        "disk_usage",
        "network_io"
      ]
    },
    "gpu": {
      "enabled": true,
      "collect": [
        "gpu_utilization",
        "gpu_memory_used",
        "gpu_memory_total",
        "gpu_temperature",
        "gpu_power_usage"
      ],
      "alert_thresholds": {
        "temperature_celsius": 85,
        "memory_utilization_percent": 95,
        "power_watts": 400
      }
    },
    "model": {
      "enabled": true,
      "collect": [
        "requests_per_second",
        "tokens_per_second",
        "average_latency_ms",
        "p50_latency_ms",
        "p95_latency_ms",
        "p99_latency_ms",
        "error_rate",
        "active_connections",
        "queue_depth"
      ]
    },
    "inference": {
      "enabled": true,
      "collect": [
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "generation_time_ms",
        "preprocessing_time_ms",
        "postprocessing_time_ms"
      ]
    }
  },
  "alerts": {
    "enabled": true,
    "channels": [
      "email",
      "slack",
      "pagerduty"
    ],
    "rules": [
      {
        "name": "high_error_rate",
        "condition": "error_rate > 0.05",
        "duration_seconds": 300,
        "severity": "critical",
        "message": "Error rate exceeded 5% for 5 minutes"
      },
      {
        "name": "high_latency",
        "condition": "p95_latency_ms > 5000",
        "duration_seconds": 180,
        "severity": "warning",
        "message": "P95 latency exceeded 5 seconds"
      },
      {
        "name": "gpu_temperature_high",
        "condition": "gpu_temperature > 85",
        "duration_seconds": 60,
        "severity": "critical",
        "message": "GPU temperature critically high"
      },
      {
        "name": "memory_pressure",
        "condition": "gpu_memory_used / gpu_memory_total > 0.95",
        "duration_seconds": 300,
        "severity": "warning",
        "message": "GPU memory utilization above 95%"
      },
      {
        "name": "low_throughput",
        "condition": "tokens_per_second < 10",
        "duration_seconds": 600,
        "severity": "warning",
        "message": "Throughput below 10 tokens/second"
      }
    ]
  },
  "logging": {
    "level": "INFO",
    "format": "json",
    "outputs": [
      {
        "type": "file",
        "path": "./logs/monitoring.log",
        "rotation": "daily",
        "retention_days": 30
      },
      {
        "type": "stdout",
        "enabled": true
      },
      {
        "type": "elasticsearch",
        "enabled": false,
        "host": "localhost:9200",
        "index": "helion-metrics"
      }
    ]
  },
  "prometheus": {
    "enabled": true,
    "port": 8001,
    "path": "/metrics",
    "namespace": "helion",
    "subsystem": "inference",
    "labels": {
      "model": "Helion-2.5-Rnd",
      "version": "2.5.0-rnd",
      "environment": "production"
    }
  },
  "grafana": {
    "enabled": true,
    "dashboards": [
      {
        "name": "Helion Overview",
        "file": "./monitoring/dashboards/overview.json",
        "refresh": "30s"
      },
      {
        "name": "GPU Metrics",
        "file": "./monitoring/dashboards/gpu.json",
        "refresh": "15s"
      },
      {
        "name": "Inference Performance",
        "file": "./monitoring/dashboards/inference.json",
        "refresh": "30s"
      }
    ]
  },
  "health_checks": {
    "enabled": true,
    "endpoint": "/health",
    "interval_seconds": 30,
    "timeout_seconds": 10,
    "checks": [
      {
        "name": "model_loaded",
        "type": "internal",
        "critical": true
      },
      {
        "name": "gpu_available",
        "type": "internal",
        "critical": true
      },
      {
        "name": "inference_responsive",
        "type": "endpoint",
        "url": "http://localhost:8000/v1/models",
        "critical": false
      }
    ]
  },
  "tracing": {
    "enabled": true,
    "sample_rate": 0.1,
    "exporter": "jaeger",
    "endpoint": "http://localhost:14268/api/traces"
  },
  "profiling": {
    "enabled": false,
    "interval_seconds": 3600,
    "duration_seconds": 300,
    "output_dir": "./profiling"
  }
}