# FBMC Chronos-2 Zero-Shot Evaluation

**Performance analysis**: Compare 14-day forecasts vs actual flows (Oct 1-14, 2025)

This notebook evaluates zero-shot forecast accuracy against ground truth.

## 1. Environment Setup

In [None]:
import os
import polars as pl
import numpy as np
from datetime import datetime
from datasets import load_dataset
import altair as alt
from pathlib import Path

# Reached only if every import above succeeded; acts as a visible checkpoint.
print("Environment setup complete")

## 2. Load Forecasts and Actuals

In [None]:
# Load the 14-day forecasts written by the full inference run.
forecast_path = Path('/home/user/app/forecasts_14day.parquet')
if not forecast_path.exists():
    raise FileNotFoundError("Run inference_full_14day.ipynb first to generate forecasts")

# Keep rows in chronological order: the D+1 metric downstream is computed from
# the first 24 rows, so an unsorted file would evaluate the wrong hours.
forecasts = pl.read_parquet(forecast_path).sort('timestamp')
print(f"Forecasts loaded: {forecasts.shape}")
print(f" Forecast period: {forecasts['timestamp'].min()} to {forecasts['timestamp'].max()}")

# Load actual values from dataset.
# HF_TOKEN may be unset, in which case None is passed through to load_dataset.
hf_token = os.getenv("HF_TOKEN")
dataset = load_dataset(
    "evgueni-p/fbmc-features-24month",
    split="train",
    token=hf_token
)
df = pl.from_arrow(dataset.data.table)

# Extract Oct 1-14 actuals (both bounds inclusive), sorted like the forecasts.
eval_start = datetime(2025, 10, 1, 0, 0)
eval_end = datetime(2025, 10, 14, 23, 0)
actuals = df.filter(
    (pl.col('timestamp') >= eval_start) &
    (pl.col('timestamp') <= eval_end)
).sort('timestamp')

# Fail here with a clear message instead of a cryptic error in a later cell.
if actuals.height == 0:
    raise ValueError(
        f"Dataset contains no rows between {eval_start} and {eval_end}; nothing to evaluate"
    )

# Select only target columns
target_cols = [col for col in actuals.columns if col.startswith('target_border_')]
if not target_cols:
    raise ValueError("Dataset has no 'target_border_*' columns to evaluate against")
actuals = actuals.select(['timestamp'] + target_cols)

print(f"Actuals loaded: {actuals.shape}")
print(f" Actual period: {actuals['timestamp'].min()} to {actuals['timestamp'].max()}")

## 3. Calculate Error Metrics

In [None]:
# Align forecasts and actuals
# Strip only the leading prefix (str.replace would remove every occurrence).
target_prefix = 'target_border_'
borders = [col[len(target_prefix):] for col in target_cols]

# Join on timestamp instead of pairing rows by position, so a missing hour or a
# different row order in either frame cannot silently shift forecasts against actuals.
forecast_cols = [f'forecast_{b}' for b in borders if f'forecast_{b}' in forecasts.columns]
forecast_ts_dtype = forecasts.schema['timestamp']
actuals_keyed = actuals.select(['timestamp'] + target_cols)
if actuals_keyed.schema['timestamp'] != forecast_ts_dtype:
    # Join keys must share a dtype (e.g. when the two sources use different time units).
    actuals_keyed = actuals_keyed.with_columns(pl.col('timestamp').cast(forecast_ts_dtype))

aligned = (
    forecasts.select(['timestamp'] + forecast_cols)
    .join(actuals_keyed, on='timestamp', how='inner')
    .sort('timestamp')
)
if aligned.height == 0:
    raise ValueError("Forecasts and actuals share no timestamps; cannot evaluate")
if aligned.height != forecasts.height or aligned.height != actuals.height:
    print(
        f"Warning: evaluating {aligned.height} overlapping hours "
        f"(forecasts: {forecasts.height}, actuals: {actuals.height})"
    )

results = []

for border in borders:
    forecast_col = f'forecast_{border}'
    actual_col = f'target_border_{border}'

    if forecast_col not in aligned.columns:
        print(f"Warning: No forecast for {border}")
        continue

    # Get forecast and actual values; casting to Float64 turns nulls into NaN
    # so the check below catches them regardless of the stored dtype.
    y_pred = aligned[forecast_col].cast(pl.Float64).to_numpy()
    y_true = aligned[actual_col].cast(pl.Float64).to_numpy()

    # Skip if any nulls
    if np.isnan(y_pred).any() or np.isnan(y_true).any():
        print(f"Warning: Nulls detected for {border}")
        continue

    # Calculate metrics
    abs_err = np.abs(y_pred - y_true)
    mae = abs_err.mean()
    rmse = np.sqrt((abs_err ** 2).mean())
    # Symmetric epsilon guard: the denominator can never hit zero, including for
    # small negative flows. Hours with ~0 MW actual flow still dominate MAPE.
    mape = (abs_err / np.maximum(np.abs(y_true), 1e-8) * 100).mean()

    # D+1 metrics (first 24 hours of the aligned, chronologically sorted window)
    mae_d1 = abs_err[:24].mean()

    results.append({
        'border': border,
        'mae_14day': mae,
        'mae_d1': mae_d1,
        'rmse_14day': rmse,
        'mape_14day': mape,
        'actual_mean': y_true.mean(),
        'actual_std': y_true.std()
    })

# An empty list would produce a frame without 'mae_d1' and make sort() fail obscurely.
if not results:
    raise ValueError(
        "No borders could be evaluated: every border was missing a forecast column or contained nulls"
    )

results_df = pl.DataFrame(results).sort('mae_d1')

print(f"\nEvaluation complete for {len(results)} borders")

## 4. Overall Performance Summary

In [None]:
# Console report of the per-border error metrics held in results_df.
banner = "=" * 60
d1_mae = results_df['mae_d1']
horizon_mae = results_df['mae_14day']
horizon_rmse = results_df['rmse_14day']
horizon_mape = results_df['mape_14day']

# Border names attaining the minimum / maximum D+1 MAE.
best_d1_border = results_df.filter(pl.col('mae_d1') == pl.col('mae_d1').min())['border'][0]
worst_d1_border = results_df.filter(pl.col('mae_d1') == pl.col('mae_d1').max())['border'][0]

print(banner)
print("ZERO-SHOT PERFORMANCE SUMMARY")
print(banner)

print("\nD+1 MAE (First 24 hours):")
print(f" Mean: {d1_mae.mean():.1f} MW")
print(f" Median: {d1_mae.median():.1f} MW")
print(f" Best: {d1_mae.min():.1f} MW ({best_d1_border})")
print(f" Worst: {d1_mae.max():.1f} MW ({worst_d1_border})")

print("\n14-Day MAE (Full horizon):")
print(f" Mean: {horizon_mae.mean():.1f} MW")
print(f" Median: {horizon_mae.median():.1f} MW")

print("\n14-Day RMSE:")
print(f" Mean: {horizon_rmse.mean():.1f} MW")
print(f" Median: {horizon_rmse.median():.1f} MW")

print("\n14-Day MAPE:")
print(f" Mean: {horizon_mape.mean():.1f}%")
print(f" Median: {horizon_mape.median():.1f}%")

# Target check: share of borders whose D+1 MAE is at or below the threshold.
target_mae = 150 # MW
borders_meeting_target = results_df.filter(pl.col('mae_d1') <= target_mae)
n_meeting = len(borders_meeting_target)
n_borders = len(results_df)
print(f"\nBorders meeting D+1 MAE target (<= {target_mae} MW):")
print(f" {n_meeting}/{n_borders} ({n_meeting/n_borders*100:.1f}%)")

print("\n" + banner)

## 5. Top 10 Best and Worst Borders

In [None]:
# Print the first and last ten rows of results_df with the key error columns.
ranking_cols = ['border', 'mae_d1', 'mae_14day', 'rmse_14day']
ranking_sections = (
    ("Top 10 Best Performers (D+1 MAE):", results_df.head(10)),
    ("\nTop 10 Worst Performers (D+1 MAE):", results_df.tail(10)),
)
for heading, ranked_rows in ranking_sections:
    print(heading)
    print(ranked_rows.select(ranking_cols))

## 6. Visualize Performance Distribution

In [None]:
# MAE distribution histogram: one count per border, binned by D+1 MAE.
mae_hist = alt.Chart(results_df.to_pandas()).mark_bar().encode(
    x=alt.X('mae_d1:Q', bin=alt.Bin(maxbins=20), title='D+1 MAE (MW)'),
    y=alt.Y('count()', title='Number of Borders')
).properties(
    width=600,
    height=300,
    title='D+1 MAE Distribution Across Borders'
)

# Add target line. Reuse target_mae from the summary cell so the rule always
# matches the threshold reported there, and convert to pandas like every other
# chart in this notebook instead of handing Altair a raw polars frame.
target_line = alt.Chart(
    pl.DataFrame({'target': [target_mae]}).to_pandas()
).mark_rule(color='red', strokeDash=[5, 5]).encode(
    x='target:Q'
)

# Layer the rule on top of the histogram; the last expression renders the chart.
mae_hist + target_line

## 7. Compare Best vs Worst Border

In [None]:
# Select best and worst border (first and last rows of results_df)
best_border = results_df.head(1)['border'][0]
worst_border = results_df.tail(1)['border'][0]

# Forecast and actual rows are matched by timestamp rather than by position, so a
# missing hour or a different row order cannot shift one line against the other.
forecast_ts_dtype = forecasts.schema['timestamp']
actuals_keyed = actuals
if actuals.schema['timestamp'] != forecast_ts_dtype:
    # Join keys must share a dtype (e.g. when the two sources use different time units).
    actuals_keyed = actuals.with_columns(pl.col('timestamp').cast(forecast_ts_dtype))

# Create comparison charts.
# Loop-local names are distinct from `border` / `mae` used in the metrics cell so
# re-running this cell does not overwrite those notebook-level variables.
charts = []
for plot_border in [best_border, worst_border]:
    # Combine forecast and actual into long format: one row per (timestamp, type)
    viz_data = (
        forecasts.select(
            'timestamp',
            pl.col(f'forecast_{plot_border}').alias('Forecast'),
        )
        .join(
            actuals_keyed.select(
                'timestamp',
                pl.col(f'target_border_{plot_border}').alias('Actual'),
            ),
            on='timestamp',
            how='inner',
        )
        .sort('timestamp')
        .unpivot(index='timestamp', variable_name='type', value_name='flow')
    )

    border_mae_d1 = results_df.filter(pl.col('border') == plot_border)['mae_d1'][0]

    chart = alt.Chart(viz_data.to_pandas()).mark_line().encode(
        x=alt.X('timestamp:T', title='Date'),
        y=alt.Y('flow:Q', title='Flow (MW)'),
        color='type:N',
        strokeDash='type:N'
    ).properties(
        width=600,
        height=250,
        title=f'{plot_border} (D+1 MAE: {border_mae_d1:.1f} MW)'
    )
    charts.append(chart)

# Stack the two charts vertically; the last expression renders the figure.
alt.vconcat(*charts).properties(
    title='Best vs Worst Performing Border'
)

## 8. Export Results

In [None]:
# Persist the per-border metrics table as CSV, then print a closing recap.
output_path = Path('/home/user/app/evaluation_results.csv')
results_df.write_csv(output_path)

# Mean D+1 MAE across borders, compared against the 150 MW target.
mean_d1_mae = results_df['mae_d1'].mean()
if mean_d1_mae <= 150:
    target_status = 'ACHIEVED'
else:
    target_status = 'NOT MET'

recap_lines = [
    f"✓ Results saved to {output_path}",
    "\nEvaluation complete!",
    f" Borders evaluated: {len(results_df)}",
    f" Mean D+1 MAE: {mean_d1_mae:.1f} MW",
    f" Target (<= 150 MW): {target_status}",
]
for recap_line in recap_lines:
    print(recap_line)