{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# FBMC Chronos-2 Zero-Shot Evaluation\n", "\n", "**Performance analysis**: Compare 14-day forecasts vs actual flows (Oct 1-14, 2025)\n", "\n", "This notebook evaluates zero-shot forecast accuracy against ground truth." ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Environment Setup" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import os\n",
"import polars as pl\n",
"import numpy as np\n",
"from datetime import datetime\n",
"from datasets import load_dataset\n",
"import altair as alt\n",
"from pathlib import Path\n",
"\n",
"print(\"Environment setup complete\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Load Forecasts and Actuals" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Load forecasts from full inference run\n",
"forecast_path = Path('/home/user/app/forecasts_14day.parquet')\n",
"if not forecast_path.exists():\n",
"    raise FileNotFoundError(\"Run inference_full_14day.ipynb first to generate forecasts\")\n",
"\n",
"forecasts = pl.read_parquet(forecast_path)\n",
"print(f\"Forecasts loaded: {forecasts.shape}\")\n",
"print(f\" Forecast period: {forecasts['timestamp'].min()} to {forecasts['timestamp'].max()}\")\n",
"\n",
"# Load actual values from dataset\n",
"hf_token = os.getenv(\"HF_TOKEN\")\n",
"dataset = load_dataset(\n",
"    \"evgueni-p/fbmc-features-24month\",\n",
"    split=\"train\",\n",
"    token=hf_token\n",
")\n",
"df = pl.from_arrow(dataset.data.table)\n",
"\n",
"# Extract Oct 1-14 actuals; sort so any cell that relies on row order sees\n",
"# the hours in chronological order\n",
"actuals = df.filter(\n",
"    (pl.col('timestamp') >= datetime(2025, 10, 1, 0, 0)) &\n",
"    (pl.col('timestamp') <= datetime(2025, 10, 14, 23, 0))\n",
").sort('timestamp')\n",
"\n",
"# Select only target columns, failing early with a clear message if the\n",
"# dataset does not contain what the evaluation needs\n",
"target_cols = [col for col in actuals.columns if col.startswith('target_border_')]\n",
"if not target_cols:\n",
"    raise ValueError(\"Dataset has no 'target_border_*' columns to evaluate against\")\n",
"if actuals.height == 0:\n",
"    raise ValueError(\"Dataset has no rows inside the Oct 1-14, 2025 evaluation window\")\n",
"actuals = actuals.select(['timestamp'] + target_cols)\n",
"\n",
"print(f\"Actuals loaded: {actuals.shape}\")\n",
"print(f\" Actual period: {actuals['timestamp'].min()} to {actuals['timestamp'].max()}\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Calculate Error Metrics" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Align forecasts and actuals on timestamp so every forecast hour is scored\n",
"# against the actual for the same hour (row position alone is not a safe key).\n",
"borders = [col.replace('target_border_', '') for col in target_cols]\n",
"\n",
"ts_dtype = actuals.schema['timestamp']\n",
"forecasts_keyed = forecasts\n",
"if forecasts.schema['timestamp'] != ts_dtype:\n",
"    # Join keys must share a dtype (e.g. datetime[ns] vs datetime[us]).\n",
"    # NOTE(review): assumes both timestamp columns describe the same clock - confirm.\n",
"    forecasts_keyed = forecasts.with_columns(pl.col('timestamp').cast(ts_dtype))\n",
"\n",
"aligned = forecasts_keyed.join(actuals, on='timestamp', how='inner').sort('timestamp')\n",
"if aligned.height == 0:\n",
"    raise ValueError(\"Forecasts and actuals share no timestamps\")\n",
"if aligned.height != forecasts.height:\n",
"    print(f\"Warning: only {aligned.height} of {forecasts.height} forecast hours have actuals\")\n",
"\n",
"results = []\n",
"\n",
"for border in borders:\n",
"    forecast_col = f'forecast_{border}'\n",
"    actual_col = f'target_border_{border}'\n",
"\n",
"    if forecast_col not in forecasts.columns:\n",
"        print(f\"Warning: No forecast for {border}\")\n",
"        continue\n",
"\n",
"    # Casting to Float64 turns polars nulls into NaN so the check below sees them\n",
"    y_pred = aligned[forecast_col].cast(pl.Float64).to_numpy()\n",
"    y_true = aligned[actual_col].cast(pl.Float64).to_numpy()\n",
"\n",
"    # Skip if any nulls\n",
"    if np.isnan(y_pred).any() or np.isnan(y_true).any():\n",
"        print(f\"Warning: Nulls detected for {border}\")\n",
"        continue\n",
"\n",
"    # Calculate metrics\n",
"    abs_err = np.abs(y_pred - y_true)\n",
"    mae = abs_err.mean()\n",
"    rmse = np.sqrt((abs_err ** 2).mean())\n",
"\n",
"    # MAPE is undefined where the actual flow is zero; dividing by a tiny\n",
"    # epsilon there yields ~1e10 % and swamps the average, so score only the\n",
"    # non-zero hours and record a null when there are none.\n",
"    nonzero = np.abs(y_true) > 1e-8\n",
"    if nonzero.any():\n",
"        mape = float((abs_err[nonzero] / np.abs(y_true[nonzero])).mean() * 100)\n",
"    else:\n",
"        mape = None\n",
"\n",
"    # D+1 metrics (first 24 aligned hours)\n",
"    mae_d1 = abs_err[:24].mean()\n",
"\n",
"    results.append({\n",
"        'border': border,\n",
"        'mae_14day': mae,\n",
"        'mae_d1': mae_d1,\n",
"        'rmse_14day': rmse,\n",
"        'mape_14day': mape,\n",
"        'actual_mean': y_true.mean(),\n",
"        'actual_std': y_true.std()\n",
"    })\n",
"\n",
"if not results:\n",
"    raise ValueError(\"No border could be evaluated (forecast missing or nulls present for every border)\")\n",
"\n",
"# infer_schema_length=None scans every row, so a leading run of null MAPEs\n",
"# cannot fix the column dtype prematurely\n",
"results_df = pl.DataFrame(results, infer_schema_length=None).sort('mae_d1')\n",
"\n",
"print(f\"\\nEvaluation complete for {len(results)} borders\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
Overall Performance Summary" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Headline zero-shot metrics aggregated over every evaluated border\n",
"banner = \"=\" * 60\n",
"d1_series = results_df['mae_d1']\n",
"full_mae_series = results_df['mae_14day']\n",
"rmse_series = results_df['rmse_14day']\n",
"mape_series = results_df['mape_14day']\n",
"\n",
"# First border whose D+1 MAE equals the minimum / maximum\n",
"best_name = results_df.filter(pl.col('mae_d1') == pl.col('mae_d1').min())['border'][0]\n",
"worst_name = results_df.filter(pl.col('mae_d1') == pl.col('mae_d1').max())['border'][0]\n",
"\n",
"print(banner)\n",
"print(\"ZERO-SHOT PERFORMANCE SUMMARY\")\n",
"print(banner)\n",
"print(\"\\nD+1 MAE (First 24 hours):\")\n",
"print(f\" Mean: {d1_series.mean():.1f} MW\")\n",
"print(f\" Median: {d1_series.median():.1f} MW\")\n",
"print(f\" Best: {d1_series.min():.1f} MW ({best_name})\")\n",
"print(f\" Worst: {d1_series.max():.1f} MW ({worst_name})\")\n",
"\n",
"print(\"\\n14-Day MAE (Full horizon):\")\n",
"print(f\" Mean: {full_mae_series.mean():.1f} MW\")\n",
"print(f\" Median: {full_mae_series.median():.1f} MW\")\n",
"\n",
"print(\"\\n14-Day RMSE:\")\n",
"print(f\" Mean: {rmse_series.mean():.1f} MW\")\n",
"print(f\" Median: {rmse_series.median():.1f} MW\")\n",
"\n",
"print(\"\\n14-Day MAPE:\")\n",
"print(f\" Mean: {mape_series.mean():.1f}%\")\n",
"print(f\" Median: {mape_series.median():.1f}%\")\n",
"\n",
"# Target check: share of borders whose D+1 MAE is within the target\n",
"target_mae = 150 # MW\n",
"borders_meeting_target = results_df.filter(pl.col('mae_d1') <= target_mae)\n",
"n_meeting = len(borders_meeting_target)\n",
"n_borders = len(results_df)\n",
"print(f\"\\nBorders meeting D+1 MAE target (<= {target_mae} MW):\")\n",
"print(f\" {n_meeting}/{n_borders} ({n_meeting/n_borders*100:.1f}%)\")\n",
"\n",
"print(\"\\n\" + banner)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 5. 
Top 10 Best and Worst Borders" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Rank explicitly in each direction so the tables do not depend on how\n",
"# results_df happened to be ordered by an earlier cell\n",
"ranking_cols = ['border', 'mae_d1', 'mae_14day', 'rmse_14day']\n",
"\n",
"print(\"Top 10 Best Performers (D+1 MAE):\")\n",
"print(results_df.sort('mae_d1').head(10).select(ranking_cols))\n",
"\n",
"# Descending sort puts the single worst border on the first row\n",
"print(\"\\nTop 10 Worst Performers (D+1 MAE):\")\n",
"print(results_df.sort('mae_d1', descending=True).head(10).select(ranking_cols))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Visualize Performance Distribution" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# MAE distribution histogram\n",
"mae_hist = alt.Chart(results_df.to_pandas()).mark_bar().encode(\n",
"    x=alt.X('mae_d1:Q', bin=alt.Bin(maxbins=20), title='D+1 MAE (MW)'),\n",
"    y=alt.Y('count()', title='Number of Borders')\n",
").properties(\n",
"    width=600,\n",
"    height=300,\n",
"    title='D+1 MAE Distribution Across Borders'\n",
")\n",
"\n",
"# Add target line. The threshold comes from target_mae (set in the summary\n",
"# cell) so the chart cannot drift from the printed target, and the frame is\n",
"# converted to pandas like every other chart input in this notebook.\n",
"target_line = alt.Chart(\n",
"    pl.DataFrame({'target': [target_mae]}).to_pandas()\n",
").mark_rule(color='red', strokeDash=[5, 5]).encode(\n",
"    x='target:Q'\n",
")\n",
"\n",
"mae_hist + target_line" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 7. 
Compare Best vs Worst Border" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Select best and worst border (results_df is sorted ascending by D+1 MAE)\n",
"best_border = results_df.head(1)['border'][0]\n",
"worst_border = results_df.tail(1)['border'][0]\n",
"\n",
"# Join keys must share a dtype, so forecasts adopt the actuals' timestamp dtype.\n",
"# NOTE(review): assumes both timestamp columns describe the same clock - confirm.\n",
"ts_dtype = actuals.schema['timestamp']\n",
"\n",
"# Create comparison charts\n",
"charts = []\n",
"for border in [best_border, worst_border]:\n",
"    forecast_part = forecasts.select(\n",
"        pl.col('timestamp').cast(ts_dtype),\n",
"        pl.col(f'forecast_{border}').alias('Forecast'),\n",
"    )\n",
"    actual_part = actuals.select(\n",
"        pl.col('timestamp'),\n",
"        pl.col(f'target_border_{border}').alias('Actual'),\n",
"    )\n",
"\n",
"    # Pair forecast and actual by timestamp rather than by row position, so a\n",
"    # missing or unsorted hour cannot shift one line against the other\n",
"    viz_data = (\n",
"        forecast_part\n",
"        .join(actual_part, on='timestamp', how='inner')\n",
"        .sort('timestamp')\n",
"        .unpivot(index='timestamp', variable_name='type', value_name='flow')\n",
"    )\n",
"\n",
"    # Local name avoids overwriting the `mae` left behind by the metrics cell\n",
"    border_d1_mae = results_df.filter(pl.col('border') == border)['mae_d1'][0]\n",
"\n",
"    chart = alt.Chart(viz_data.to_pandas()).mark_line().encode(\n",
"        x=alt.X('timestamp:T', title='Date'),\n",
"        y=alt.Y('flow:Q', title='Flow (MW)'),\n",
"        color='type:N',\n",
"        strokeDash='type:N'\n",
"    ).properties(\n",
"        width=600,\n",
"        height=250,\n",
"        title=f'{border} (D+1 MAE: {border_d1_mae:.1f} MW)'\n",
"    )\n",
"    charts.append(chart)\n",
"\n",
"alt.vconcat(*charts).properties(\n",
"    title='Best vs Worst Performing Border'\n",
")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 8. 
Export Results" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Save results to CSV\n",
"output_path = Path('/home/user/app/evaluation_results.csv')\n",
"results_df.write_csv(output_path)\n",
"\n",
"# Compare against target_mae (set in the summary cell) so the verdict and the\n",
"# printed threshold always match the target used earlier in the notebook\n",
"mean_d1_mae = results_df['mae_d1'].mean()\n",
"target_status = 'ACHIEVED' if mean_d1_mae <= target_mae else 'NOT MET'\n",
"\n",
"print(f\"✓ Results saved to {output_path}\")\n",
"print(\"\\nEvaluation complete!\")\n",
"print(f\" Borders evaluated: {len(results_df)}\")\n",
"print(f\" Mean D+1 MAE: {mean_d1_mae:.1f} MW\")\n",
"print(f\" Target (<= {target_mae} MW): {target_status}\")" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 4 }