Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Holdout Evaluation of Chronos 2 Zero-Shot Forecasts | |
| Forecasts Sept 1-14, 2025 using context up to Aug 31, 2025 | |
| Compares against actual values to calculate MAE, RMSE, MAPE | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import polars as pl | |
| from datetime import datetime, timedelta | |
| from chronos import Chronos2Pipeline | |
| import torch | |
| import time | |
| import os | |
| def main(): | |
| print("="*60) | |
| print("CHRONOS 2 ZERO-SHOT EVALUATION") | |
| print("="*60) | |
| total_start = time.time() | |
| # Step 1: Load dataset | |
| print("\n[1/6] Loading dataset from local cache...") | |
| start_time = time.time() | |
| from datasets import load_dataset | |
| # Load HF token from environment | |
| hf_token = os.getenv("HF_TOKEN") | |
| if not hf_token: | |
| raise ValueError("HF_TOKEN not found in environment. Please set HF_TOKEN.") | |
| dataset = load_dataset( | |
| "evgueni-p/fbmc-features-24month", | |
| split="train", | |
| token=hf_token | |
| ) | |
| df = pl.from_pandas(dataset.to_pandas()) | |
| # Ensure timestamp is datetime | |
| if df['timestamp'].dtype == pl.String: | |
| df = df.with_columns(pl.col('timestamp').str.to_datetime()) | |
| elif df['timestamp'].dtype != pl.Datetime: | |
| df = df.with_columns(pl.col('timestamp').cast(pl.Datetime)) | |
| print(f"[OK] Loaded {len(df)} rows, {len(df.columns)} columns") | |
| print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}") | |
| print(f" Load time: {time.time() - start_time:.1f}s") | |
| # Step 2: Identify target borders | |
| print("\n[2/6] Identifying target borders...") | |
| target_cols = [col for col in df.columns if col.startswith('target_border_')] | |
| borders = [col.replace('target_border_', '') for col in target_cols] | |
| print(f"[OK] Found {len(borders)} borders") | |
| # Step 3: Define evaluation period | |
| print("\n[3/6] Setting up holdout evaluation...") | |
| # Holdout: Forecast Sept 1-14, 2025 using context up to Aug 31, 2025 | |
| holdout_end = datetime(2025, 8, 31, 23, 0, 0) # Last context timestamp | |
| forecast_start = datetime(2025, 9, 1, 0, 0, 0) # First forecast timestamp | |
| forecast_end = datetime(2025, 9, 14, 23, 0, 0) # Last forecast timestamp | |
| context_hours = 512 | |
| prediction_hours = 336 # 14 days | |
| print(f" Holdout evaluation period:") | |
| print(f" Context: up to {holdout_end}") | |
| print(f" Forecast: {forecast_start} to {forecast_end} (14 days)") | |
| print(f" Context window: {context_hours} hours") | |
| # Step 4: Extract actual values for evaluation | |
| print("\n[4/6] Extracting actual values for evaluation period...") | |
| actual_df = df.filter( | |
| (pl.col('timestamp') >= forecast_start) & | |
| (pl.col('timestamp') <= forecast_end) | |
| ) | |
| print(f"[OK] Extracted {len(actual_df)} hours of actual values") | |
| # Step 5: Load model | |
| print("\n[5/6] Loading Chronos 2 model...") | |
| model_start = time.time() | |
| # Note: Running locally, will use CPU if CUDA not available | |
| device = 'cuda' if torch.cuda.is_available() else 'cpu' | |
| print(f" Using device: {device}") | |
| pipeline = Chronos2Pipeline.from_pretrained( | |
| 'amazon/chronos-2', | |
| device_map=device, | |
| dtype=torch.float32 if device == 'cuda' else torch.float32 | |
| ) | |
| model_time = time.time() - model_start | |
| print(f"[OK] Model loaded in {model_time:.1f}s") | |
| # Step 6: Run inference for all borders and calculate metrics | |
| print(f"\n[6/6] Running holdout evaluation for {len(borders)} borders...") | |
| print(f" Progress:") | |
| results = [] | |
| inference_times = [] | |
| for i, border in enumerate(borders, 1): | |
| border_start = time.time() | |
| # Get context data (up to Aug 31, 2025) | |
| context_start = holdout_end - timedelta(hours=context_hours - 1) | |
| context_df = df.filter( | |
| (pl.col('timestamp') >= context_start) & | |
| (pl.col('timestamp') <= holdout_end) | |
| ) | |
| # Prepare context DataFrame | |
| target_col = f'target_border_{border}' | |
| context_data = context_df.select([ | |
| 'timestamp', | |
| pl.lit(border).alias('border'), | |
| pl.col(target_col).alias('target') | |
| ]).to_pandas() | |
| # Prepare future data | |
| future_timestamps = pd.date_range( | |
| start=forecast_start, | |
| periods=prediction_hours, | |
| freq='h' | |
| ) | |
| future_data = pd.DataFrame({ | |
| 'timestamp': future_timestamps, | |
| 'border': [border] * prediction_hours, | |
| 'target': [np.nan] * prediction_hours | |
| }) | |
| # Combine and predict | |
| combined_df = pd.concat([context_data, future_data], ignore_index=True) | |
| try: | |
| forecasts = pipeline.predict_df( | |
| df=combined_df, | |
| prediction_length=prediction_hours, | |
| id_column='border', | |
| timestamp_column='timestamp', | |
| target='target' | |
| ) | |
| # Get actual values for this border | |
| actual_values = actual_df.select([ | |
| 'timestamp', | |
| pl.col(target_col).alias('actual') | |
| ]).to_pandas() | |
| # Merge forecasts with actuals | |
| merged = forecasts.merge(actual_values, on='timestamp', how='left') | |
| # Calculate metrics using median (0.5 quantile) as point forecast | |
| if '0.5' in merged.columns and 'actual' in merged.columns: | |
| # Remove any rows with missing values | |
| valid_data = merged[['0.5', 'actual']].dropna() | |
| if len(valid_data) > 0: | |
| mae = np.mean(np.abs(valid_data['0.5'] - valid_data['actual'])) | |
| rmse = np.sqrt(np.mean((valid_data['0.5'] - valid_data['actual'])**2)) | |
| mape = np.mean(np.abs((valid_data['0.5'] - valid_data['actual']) / (valid_data['actual'] + 1e-10))) * 100 | |
| results.append({ | |
| 'border': border, | |
| 'mae': mae, | |
| 'rmse': rmse, | |
| 'mape': mape, | |
| 'n_points': len(valid_data), | |
| 'inference_time': time.time() - border_start | |
| }) | |
| inference_times.append(time.time() - border_start) | |
| status = "[OK]" if mae <= 150 else "[!]" # Target: <150 MW | |
| print(f" [{i:2d}/{len(borders)}] {border:15s} - MAE: {mae:6.1f} MW {status}") | |
| else: | |
| print(f" [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no valid data)") | |
| else: | |
| print(f" [{i:2d}/{len(borders)}] {border:15s} - FAILED (missing columns)") | |
| except Exception as e: | |
| print(f" [{i:2d}/{len(borders)}] {border:15s} - ERROR: {e}") | |
| inference_time = time.time() - model_start - model_time | |
| # Step 7: Calculate and display summary statistics | |
| print("\n" + "="*60) | |
| print("EVALUATION RESULTS SUMMARY") | |
| print("="*60) | |
| if results: | |
| results_df = pd.DataFrame(results) | |
| print(f"\nBorders evaluated: {len(results)}/{len(borders)}") | |
| print(f"Total inference time: {inference_time:.1f}s ({inference_time / 60:.2f} min)") | |
| print(f"Average per border: {np.mean(inference_times):.2f}s") | |
| print(f"\n*** OVERALL METRICS ***") | |
| print(f"Mean MAE: {results_df['mae'].mean():.2f} MW (Target: ≤134 MW)") | |
| print(f"Mean RMSE: {results_df['rmse'].mean():.2f} MW") | |
| print(f"Mean MAPE: {results_df['mape'].mean():.2f}%") | |
| print(f"\n*** DISTRIBUTION ***") | |
| print(f"MAE: Min={results_df['mae'].min():.2f}, Median={results_df['mae'].median():.2f}, Max={results_df['mae'].max():.2f}") | |
| print(f"RMSE: Min={results_df['rmse'].min():.2f}, Median={results_df['rmse'].median():.2f}, Max={results_df['rmse'].max():.2f}") | |
| print(f"MAPE: Min={results_df['mape'].min():.2f}%, Median={results_df['mape'].median():.2f}%, Max={results_df['mape'].max():.2f}%") | |
| # Target achievement | |
| below_target = (results_df['mae'] <= 150).sum() | |
| print(f"\n*** TARGET ACHIEVEMENT ***") | |
| print(f"Borders with MAE ≤150 MW: {below_target}/{len(results)} ({below_target/len(results)*100:.1f}%)") | |
| # Best and worst performers | |
| print(f"\n*** TOP 5 BEST PERFORMERS (Lowest MAE) ***") | |
| best = results_df.nsmallest(5, 'mae')[['border', 'mae', 'rmse', 'mape']] | |
| for idx, row in best.iterrows(): | |
| print(f" {row['border']:15s}: MAE={row['mae']:6.1f} MW, RMSE={row['rmse']:6.1f} MW, MAPE={row['mape']:5.1f}%") | |
| print(f"\n*** TOP 5 WORST PERFORMERS (Highest MAE) ***") | |
| worst = results_df.nlargest(5, 'mae')[['border', 'mae', 'rmse', 'mape']] | |
| for idx, row in worst.iterrows(): | |
| print(f" {row['border']:15s}: MAE={row['mae']:6.1f} MW, RMSE={row['rmse']:6.1f} MW, MAPE={row['mape']:5.1f}%") | |
| # Save results | |
| output_file = 'results/evaluation_results.csv' | |
| results_df.to_csv(output_file, index=False) | |
| print(f"\n[OK] Detailed results saved to: {output_file}") | |
| print("="*60) | |
| if results_df['mae'].mean() <= 134: | |
| print("[OK] TARGET ACHIEVED! Mean MAE ≤134 MW") | |
| else: | |
| print(f"[!] Target not met. Mean MAE: {results_df['mae'].mean():.2f} MW (Target: ≤134 MW)") | |
| print(" Consider fine-tuning for Phase 2") | |
| print("="*60) | |
| else: | |
| print("[!] No results to evaluate") | |
| # Total time | |
| total_time = time.time() - total_start | |
| print(f"\nTotal evaluation time: {total_time:.1f}s ({total_time / 60:.1f} min)") | |
| if __name__ == '__main__': | |
| main() | |