#!/usr/bin/env python3
"""
October 2024 Evaluation - Multivariate Chronos-2
Run date: 2024-09-30 (forecast Oct 1-14, 2024)
Compares the 38-border × 14-day forecast against actuals
Calculates D+1 through D+14 MAE for each border
"""
import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

import os
import time
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import polars as pl
from dotenv import load_dotenv
from gradio_client import Client

load_dotenv()


def main():
    print("=" * 70)
    print("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION")
    print("=" * 70)
    total_start = time.time()

    # Step 1: Connect to HF Space
    print("\n[1/6] Connecting to HuggingFace Space...")
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not found in environment")
    client = Client("evgueni-p/fbmc-chronos2", hf_token=hf_token)
    print("[OK] Connected to HF Space")

    # Step 2: Run full 14-day forecast for Oct 1-14, 2024
    print("\n[2/6] Running full 38-border forecast via HF Space API...")
    print("      Run date: 2024-09-30")
    print("      Forecast period: Oct 1-14, 2024 (336 hours)")
    print("      This may take 5-10 minutes...")
    forecast_start_time = time.time()
    result_file = client.predict(
        "2024-09-30",   # run_date
        "full_14day",   # forecast_type
    )
    forecast_time = time.time() - forecast_start_time
    print(f"[OK] Forecast complete in {forecast_time/60:.2f} minutes")
    print(f"     Result file: {result_file}")

    # Step 3: Load forecast results
    print("\n[3/6] Loading forecast results...")
    forecast_df = pl.read_parquet(result_file)
    print(f"[OK] Loaded forecast with shape: {forecast_df.shape}")
    print(f"     Columns: {len(forecast_df.columns)} "
          f"(timestamp + {len(forecast_df.columns) - 1} forecast columns)")

    # Identify border columns (median forecasts)
    median_cols = [col for col in forecast_df.columns if col.endswith('_median')]
    borders = [col.replace('_median', '') for col in median_cols]
    print(f"[OK] Found {len(borders)} borders")

    # Step 4: Load actuals from local dataset
    print("\n[4/6] Loading actual values from local dataset...")
    local_data_path = Path('data/processed/features_unified_24month.parquet')
    if not local_data_path.exists():
        print(f"[ERROR] Local dataset not found at: {local_data_path}")
        sys.exit(1)
    df = pl.read_parquet(local_data_path)
    print(f"[OK] Loaded dataset: {len(df)} rows")
    print(f"     Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

    # Extract October 1-14, 2024 actuals
    oct_start = datetime(2024, 10, 1, 0, 0, 0)
    oct_end = datetime(2024, 10, 14, 23, 0, 0)
    actual_df = df.filter(
        (pl.col('timestamp') >= oct_start) & (pl.col('timestamp') <= oct_end)
    )
    if len(actual_df) == 0:
        print("[ERROR] No actual data found for October 2024!")
        print("        Dataset may not contain October 2024 data.")
        print("        Available date range in dataset:")
        print(f"        {df['timestamp'].min()} to {df['timestamp'].max()}")
        sys.exit(1)
    print(f"[OK] Extracted {len(actual_df)} hours of actual values")

    # Step 5: Calculate metrics for each border
    print(f"\n[5/6] Calculating MAE metrics for {len(borders)} borders...")
    print("      Progress:")
    results = []
    for i, border in enumerate(borders, 1):
        # Get forecast for this border (median)
        forecast_col = f"{border}_median"
        if forecast_col not in forecast_df.columns:
            print(f"  [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no forecast)")
            continue

        # Get actual values for this border
        target_col = f'target_border_{border}'
        if target_col not in actual_df.columns:
            print(f"  [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no actuals)")
            continue

        # Merge forecast with actuals on timestamp
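        # A left join keeps every forecast timestamp; hours with no matching
        # actual come back null and are dropped by the valid_data filter below,
        # so gaps in the actuals reduce n_hours instead of raising an error.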
        merged = forecast_df.select(['timestamp', forecast_col]).join(
            actual_df.select(['timestamp', target_col]),
            on='timestamp',
            how='left',
        )

        # Calculate overall MAE (all 336 hours)
        valid_data = merged.filter(
            pl.col(forecast_col).is_not_null() & pl.col(target_col).is_not_null()
        )
        if len(valid_data) == 0:
            print(f"  [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no valid data)")
            continue

        # Validation: check for the expected 336 hours (14 days × 24 hours)
        if len(valid_data) != 336:
            print(f"  [{i:2d}/{len(borders)}] {border:15s} - WARNING: "
                  f"Only {len(valid_data)}/336 hours "
                  f"(missing {336 - len(valid_data)} hours)")

        # Calculate overall metrics
        mae = (valid_data[forecast_col] - valid_data[target_col]).abs().mean()
        rmse = ((valid_data[forecast_col] - valid_data[target_col]) ** 2).mean() ** 0.5

        # Validation: flag suspicious MAE values
        if mae < 0.001:
            print(f"  [{i:2d}/{len(borders)}] {border:15s} - WARNING: "
                  f"Suspiciously low MAE = {mae:.2e} MW (possible data leakage)")

        # Calculate per-day MAE (D+1 through D+14)
        per_day_mae = []
        for day in range(1, 15):
            day_start = oct_start + timedelta(days=day - 1)
            day_end = day_start + timedelta(days=1) - timedelta(hours=1)
            day_data = valid_data.filter(
                (pl.col('timestamp') >= day_start) & (pl.col('timestamp') <= day_end)
            )
            if len(day_data) > 0:
                day_mae = (day_data[forecast_col] - day_data[target_col]).abs().mean()
                per_day_mae.append(day_mae)
            else:
                per_day_mae.append(np.nan)

        # Build results dict with all 14 days
        result_dict = {
            'border': border,
            'mae_overall': mae,
            'rmse_overall': rmse,
            'n_hours': len(valid_data),
        }
        # Add MAE for each day (D+1 through D+14); the loop above always
        # appends exactly 14 entries, one per forecast day
        for day_idx in range(14):
            result_dict[f'mae_d{day_idx + 1}'] = per_day_mae[day_idx]

        # Validation: check for constant forecasts (same MAE across all days)
        valid_mae_values = [m for m in per_day_mae if not np.isnan(m)]
        if len(valid_mae_values) >= 5:  # need at least 5 days to check
            mae_std = np.std(valid_mae_values)
            mae_mean = np.mean(valid_mae_values)
            if mae_std < 0.01 and mae_mean > 0:  # essentially zero variation but non-zero MAE
                print(f"  [{i:2d}/{len(borders)}] {border:15s} - WARNING: "
                      f"Constant forecast detected (MAE std={mae_std:.2e})")

        results.append(result_dict)

        # Status indicator
        d1_mae = per_day_mae[0] if len(per_day_mae) > 0 else np.inf
        status = "[OK]" if d1_mae <= 150 else "[!]"
        print(f"  [{i:2d}/{len(borders)}] {border:15s} - D+1 MAE: {d1_mae:6.1f} MW {status}")

    # Step 6: Summary statistics
    print("\n[6/6] Generating summary report...")
    if not results:
        print("[ERROR] No results to summarize")
        sys.exit(1)
    results_df = pl.DataFrame(results)

    # Calculate summary statistics
    mean_mae_d1 = results_df['mae_d1'].mean()
    median_mae_d1 = results_df['mae_d1'].median()
    min_mae_d1 = results_df['mae_d1'].min()
    max_mae_d1 = results_df['mae_d1'].max()

    # Save results to CSV
    output_file = Path('results/october_2024_multivariate.csv')
    output_file.parent.mkdir(exist_ok=True)
    results_df.write_csv(output_file)
    print(f"[OK] Results saved to: {output_file}")

    # Generate summary report
    print("\n" + "=" * 70)
    print("EVALUATION RESULTS SUMMARY - OCTOBER 2024")
    print("=" * 70)
    print(f"\nBorders evaluated: {len(results)}/{len(borders)}")
    print(f"Total forecast time: {forecast_time/60:.2f} minutes")
    print(f"Total evaluation time: {(time.time() - total_start)/60:.2f} minutes")
    print("\n*** D+1 MAE (PRIMARY METRIC) ***")
    print(f"Mean:   {mean_mae_d1:.2f} MW (Target: <=134 MW)")
    print(f"Median: {median_mae_d1:.2f} MW")
    print(f"Min:    {min_mae_d1:.2f} MW")
    print(f"Max:    {max_mae_d1:.2f} MW")
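    # Note: two thresholds appear below. 134 MW is the primary mean-D+1 target
    # checked in the final verdict; 150 MW is the looser per-border acceptance
    # bound used for the achievement count and the per-border status flags above.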
    # Target achievement
    below_target = (results_df['mae_d1'] <= 150).sum()
    print("\n*** TARGET ACHIEVEMENT ***")
    print(f"Borders with D+1 MAE <=150 MW: {below_target}/{len(results)} "
          f"({below_target / len(results) * 100:.1f}%)")

    # Best and worst performers
    print("\n*** TOP 5 BEST PERFORMERS (Lowest D+1 MAE) ***")
    best = results_df.sort('mae_d1').head(5)
    for row in best.iter_rows(named=True):
        print(f"  {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, "
              f"Overall MAE={row['mae_overall']:6.1f} MW")

    print("\n*** TOP 5 WORST PERFORMERS (Highest D+1 MAE) ***")
    worst = results_df.sort('mae_d1', descending=True).head(5)
    for row in worst.iter_rows(named=True):
        print(f"  {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, "
              f"Overall MAE={row['mae_overall']:6.1f} MW")

    # MAE degradation over forecast horizon
    print("\n*** MAE DEGRADATION OVER FORECAST HORIZON (ALL 14 DAYS) ***")
    for day in range(1, 15):
        mean_mae_day = results_df[f'mae_d{day}'].mean()
        if day == 1:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (baseline)")
        else:
            delta = mean_mae_day - mean_mae_d1
            delta_pct = (delta / mean_mae_d1 * 100) if mean_mae_d1 > 0 else 0
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (+{delta:5.2f} MW, +{delta_pct:5.1f}%)")

    # Final verdict
    print("\n" + "=" * 70)
    if mean_mae_d1 <= 134:
        print("[OK] TARGET ACHIEVED! Mean D+1 MAE <=134 MW")
        print("     Zero-shot multivariate forecasting successful!")
    elif mean_mae_d1 <= 150:
        print("[~] CLOSE TO TARGET. Mean D+1 MAE <=150 MW")
        print("    Zero-shot baseline established. Fine-tuning recommended.")
    else:
        print(f"[!] TARGET NOT MET. Mean D+1 MAE: {mean_mae_d1:.2f} MW (Target: <=134 MW)")
        print("    Fine-tuning strongly recommended for Phase 2")
    print("=" * 70)

    # Save summary report
    report_file = Path('results/october_2024_evaluation_report.txt')
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        f.write("=" * 70 + "\n")
        f.write("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION REPORT\n")
        f.write("=" * 70 + "\n\n")
        f.write("Run date: 2024-09-30\n")
        f.write("Forecast period: Oct 1-14, 2024 (336 hours)\n")
        f.write("Model: amazon/chronos-2 (multivariate, 615 features)\n")
        f.write(f"Borders evaluated: {len(results)}/{len(borders)}\n")
        f.write(f"Forecast time: {forecast_time/60:.2f} minutes\n\n")
        f.write("D+1 MAE RESULTS:\n")
        f.write(f"  Mean:   {mean_mae_d1:.2f} MW\n")
        f.write(f"  Median: {median_mae_d1:.2f} MW\n")
        f.write(f"  Min:    {min_mae_d1:.2f} MW\n")
        f.write(f"  Max:    {max_mae_d1:.2f} MW\n\n")
        f.write(f"Target achievement: {below_target}/{len(results)} borders with MAE <=150 MW\n\n")
        if mean_mae_d1 <= 134:
            f.write("[OK] TARGET ACHIEVED!\n")
        else:
            f.write("[!] Target not met - fine-tuning recommended\n")
    print(f"\n[OK] Summary report saved to: {report_file}")
    print(f"\nTotal evaluation time: {(time.time() - total_start)/60:.1f} minutes")


if __name__ == '__main__':
    main()
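# Usage sketch (filename illustrative; assumes HF_TOKEN is available via .env
# or the environment, and that data/processed/features_unified_24month.parquet
# exists locally):
#   python october_2024_evaluation.py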