{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# FBMC Chronos-2 Zero-Shot Inference - Full Production Forecast\n", "\n", "**Production run**: 38 borders × 14 days (336 hours)\n", "\n", "This notebook runs complete zero-shot forecasts for all FBMC borders on HuggingFace Space with GPU." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Environment Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import os\n", "import polars as pl\n", "import torch\n", "from datetime import datetime, timedelta\n", "from datasets import load_dataset\n", "from chronos import ChronosPipeline\n", "import altair as alt\n", "from pathlib import Path\n", "\n", "# Add src to path for imports\n", "import sys\n", "sys.path.append('/home/user/app/src') # HF Space path\n", "\n", "from forecasting.dynamic_forecast import DynamicForecast\n", "from forecasting.feature_availability import FeatureAvailability\n", "\n", "print(\"Environment setup complete\")\n", "print(f\"PyTorch version: {torch.__version__}\")\n", "print(f\"GPU available: {torch.cuda.is_available()}\")\n", "if torch.cuda.is_available():\n", " print(f\"GPU device: {torch.cuda.get_device_name(0)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Load Extended Dataset from HuggingFace" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Loading dataset from HuggingFace...\")\n", "start_time = time.time()\n", "\n", "# Load dataset\n", "hf_token = os.getenv(\"HF_TOKEN\")\n", "dataset = load_dataset(\n", " \"evgueni-p/fbmc-features-24month\",\n", " split=\"train\",\n", " token=hf_token\n", ")\n", "\n", "# Convert to Polars\n", "df = pl.from_arrow(dataset.data.table)\n", "\n", "print(f\"✓ Loaded: {df.shape}\")\n", "print(f\" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}\")\n", "print(f\" Load time: {time.time() - start_time:.1f}s\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Configure Dynamic Forecast System" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Categorize features by availability\n", "categories = FeatureAvailability.categorize_features(df.columns)\n", "\n", "print(\"Feature categorization:\")\n", "print(f\" Full-horizon D+14: {len(categories['full_horizon_d14'])} features\")\n", "print(f\" Partial D+1: {len(categories['partial_d1'])} features\")\n", "print(f\" Historical only: {len(categories['historical'])} features\")\n", "print(f\" Total: {sum(len(v) for v in categories.values())} features\")\n", "\n", "# Identify target borders\n", "target_cols = [col for col in df.columns if col.startswith('target_border_')]\n", "borders = [col.replace('target_border_', '') for col in target_cols]\n", "print(f\"\\n✓ Found {len(borders)} borders\")\n", "print(f\" Borders: {', '.join(borders[:5])}...\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Load Chronos-2 Model on GPU" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Loading Chronos-2 Large model...\")\n", "start_time = time.time()\n", "\n", "pipeline = ChronosPipeline.from_pretrained(\n", "    \"amazon/chronos-t5-large\",\n", "    device_map=\"cuda\",\n", "    torch_dtype=torch.bfloat16\n", ")\n", "\n", "print(f\"✓ Model loaded in {time.time() - start_time:.1f}s\")\n", "print(f\" Device: {next(pipeline.model.parameters()).device}\")\n", "print(f\" Dtype: {next(pipeline.model.parameters()).dtype}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Run Zero-Shot Inference for All Borders" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Production configuration\n", "prediction_hours = 336  # 14 days\n", "context_hours = 512  # Context window\n", "run_date = datetime(2025, 9, 30, 23, 0)  # Sept 30 11 PM\n", "\n", "print(\"Production forecast configuration:\")\n", "print(f\" Run date: {run_date}\")\n", "print(f\" Context: {context_hours} hours\")\n", "print(f\" Forecast: {prediction_hours} hours (14 days)\")\n", "print(\" Forecast range: Oct 1 00:00 to Oct 14 23:00\")\n", "print(f\" Borders: {len(borders)}\")\n", "print()\n", "\n", "# Initialize dynamic forecast\n", "forecaster = DynamicForecast(\n", "    df=df,\n", "    feature_categories=categories\n", ")\n", "\n", "# Storage for all forecasts\n", "all_forecasts = {}\n", "inference_times = {}\n", "\n", "# Run inference for each border\n", "total_start = time.time()\n", "\n", "for i, border in enumerate(borders, 1):\n", "    print(f\"[{i}/{len(borders)}] Processing {border}...\", end=\" \")\n", "\n", "    try:\n", "        # Extract data\n", "        context_data, future_data = forecaster.prepare_forecast_data(\n", "            run_date=run_date,\n", "            border=border\n", "        )\n", "\n", "        # Get context (last 512 hours) as a 1D float tensor, as expected by ChronosPipeline\n", "        context_values = context_data.select([border]).to_numpy()[-context_hours:].flatten()\n", "        context = torch.tensor(context_values, dtype=torch.float32)\n", "\n", "        # Run inference; the 336-hour horizon exceeds the model's native 64-step limit,\n", "        # so limit_prediction_length=False lets generation continue autoregressively\n", "        start_time = time.time()\n", "        forecast = pipeline.predict(\n", "            context=context,\n", "            prediction_length=prediction_hours,\n", "            num_samples=20,\n", "            limit_prediction_length=False\n", "        )\n", "        elapsed = time.time() - start_time\n", "\n", "        # Store the median across samples; forecast shape is [1, num_samples, prediction_length]\n", "        forecast_median = np.median(forecast[0].float().numpy(), axis=0)\n", "        all_forecasts[border] = forecast_median\n", "        inference_times[border] = elapsed\n", "\n", "        print(f\"✓ {elapsed:.1f}s\")\n", "\n", "    except Exception as e:\n", "        print(f\"✗ ERROR: {str(e)}\")\n", "        all_forecasts[border] = None\n", "        inference_times[border] = 0.0\n", "\n", "total_time = time.time() - total_start\n", "\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"INFERENCE COMPLETE\")\n", "print(\"=\"*60)\n", "print(f\"Total time: {total_time/60:.1f} minutes\")\n", "print(f\"Avg per border: {total_time/len(borders):.1f}s\")\n", "print(f\"Successful: {sum(1 for v in all_forecasts.values() if v is not None)}/{len(borders)}\")" ] },
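{ "cell_type": "markdown", "metadata": {}, "source": [ "Before writing the parquet file, a quick consistency check is worthwhile. This is a minimal sketch, not part of the original run: it relies only on `all_forecasts` and `prediction_hours` from the loop above and flags any forecast with an unexpected length or NaN values." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Consistency check (sketch): each successful forecast should have 336 values and no NaNs\n", "import numpy as np\n", "\n", "bad = {}\n", "for border, fc in all_forecasts.items():\n", "    if fc is None:\n", "        continue\n", "    if len(fc) != prediction_hours or np.isnan(fc).any():\n", "        bad[border] = (len(fc), int(np.isnan(fc).sum()))\n", "\n", "if bad:\n", "    print(f\"⚠ {len(bad)} forecasts need attention (length, NaN count): {bad}\")\n", "else:\n", "    print(\"✓ All successful forecasts have the expected length and contain no NaNs\")" ] },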
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Save Forecasts to Parquet" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create timestamp range for forecasts\n", "forecast_timestamps = pl.datetime_range(\n", "    datetime(2025, 10, 1, 0, 0),\n", "    datetime(2025, 10, 14, 23, 0),\n", "    interval='1h',\n", "    eager=True\n", ")\n", "\n", "# Build forecast DataFrame\n", "forecast_data = {'timestamp': forecast_timestamps}\n", "for border, forecast in all_forecasts.items():\n", "    if forecast is not None:\n", "        forecast_data[f'forecast_{border}'] = forecast.tolist()\n", "    else:\n", "        forecast_data[f'forecast_{border}'] = [None] * len(forecast_timestamps)\n", "\n", "forecast_df = pl.DataFrame(forecast_data)\n", "\n", "# Save to parquet\n", "output_path = Path('/home/user/app/forecasts_14day.parquet')\n", "forecast_df.write_parquet(output_path)\n", "\n", "print(f\"✓ Forecasts saved: {forecast_df.shape}\")\n", "print(f\" File: {output_path}\")\n", "print(f\" Size: {output_path.stat().st_size / 1024 / 1024:.1f} MB\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Visualize Sample Borders" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Select up to 4 representative borders with successful forecasts for visualization\n", "sample_borders = [b for b in borders if all_forecasts[b] is not None][:4]\n", "\n", "charts = []\n", "for border in sample_borders:\n", "    viz_data = pl.DataFrame({\n", "        'timestamp': forecast_timestamps,\n", "        'forecast': all_forecasts[border].tolist()\n", "    })\n", "\n", "    chart = alt.Chart(viz_data.to_pandas()).mark_line().encode(\n", "        x=alt.X('timestamp:T', title='Date'),\n", "        y=alt.Y('forecast:Q', title='Flow (MW)'),\n", "        tooltip=['timestamp:T', alt.Tooltip('forecast:Q', format='.0f')]\n", "    ).properties(\n", "        width=400,\n", "        height=200,\n", "        title=f'{border}'\n", "    )\n", "    charts.append(chart)\n", "\n", "# Combine into a 2x2 grid (fall back to a single row if fewer than 4 forecasts succeeded)\n", "if len(charts) == 4:\n", "    combined = alt.vconcat(\n", "        alt.hconcat(charts[0], charts[1]),\n", "        alt.hconcat(charts[2], charts[3])\n", "    ).properties(\n", "        title='Sample Zero-Shot Forecasts (Oct 1-14, 2025)'\n", "    )\n", "else:\n", "    combined = alt.hconcat(*charts).properties(\n", "        title='Sample Zero-Shot Forecasts (Oct 1-14, 2025)'\n", "    )\n", "\n", "combined" ] },
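{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, the sample chart can be exported as a standalone HTML file so it can be downloaded from the Space alongside the parquet output. This is a sketch; the output path simply mirrors the parquet location and is an assumption." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional (sketch): save the combined chart as standalone HTML next to the parquet output\n", "chart_path = Path('/home/user/app/forecasts_sample.html')\n", "combined.save(str(chart_path))\n", "print(f\"✓ Chart saved: {chart_path}\")" ] },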
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 8. Performance Summary" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create performance summary\n", "perf_data = pl.DataFrame({\n", "    'border': list(inference_times.keys()),\n", "    'inference_time_s': list(inference_times.values()),\n", "    'status': ['SUCCESS' if all_forecasts[b] is not None else 'FAILED' for b in inference_times.keys()]\n", "}).sort('inference_time_s', descending=True)\n", "\n", "print(\"\\nTop 10 Slowest Borders:\")\n", "print(perf_data.head(10))\n", "\n", "# Failed borders are recorded as 0.0s, so compute statistics on successful runs only\n", "succ_times = perf_data.filter(pl.col('status') == 'SUCCESS')['inference_time_s']\n", "print(\"\\nPerformance Statistics (successful borders):\")\n", "print(f\" Mean: {succ_times.mean():.1f}s\")\n", "print(f\" Median: {succ_times.median():.1f}s\")\n", "print(f\" Min: {succ_times.min():.1f}s\")\n", "print(f\" Max: {succ_times.max():.1f}s\")\n", "\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"PRODUCTION FORECAST COMPLETE\")\n", "print(\"=\"*60)\n", "print(f\"Borders processed: {len(borders)}\")\n", "print(\"Forecast horizon: 14 days (336 hours)\")\n", "print(f\"Total runtime: {total_time/60:.1f} minutes\")\n", "print(\"Output: forecasts_14day.parquet\")\n", "print(\"\\n✓ Ready for evaluation against Oct 1-14 actuals\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 4 }