#!/usr/bin/env python3
"""
Feature Availability Module

Categorizes 2,514 features by their availability windows for forecasting.

Purpose: Prevent data leakage by clearly defining what features are available
at run time for different forecast horizons.

Categories:
1. Full-horizon D+14 (always known): temporal, weather, CNEC outages, LTA
2. Partial D+1 only (masked D+2-D+14): load forecasts
3. Historical only (not available): prices, generation, demand, lags, etc.
"""

from typing import Dict, List, Tuple, Set
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


class FeatureAvailability:
    """
    Defines availability windows for all features in the dataset.

    Availability Horizons:
    - D+14: Available for full 14-day forecast (temporal, weather, outages)
    - D+1: Available for day-ahead only (load forecasts)
    - D+0: Current value only, forward-filled (LTA); because of the
      forward-fill it is bucketed and masked like a full-horizon feature
    - Historical: Not available for future (prices, generation, demand, lags)

    Matching rules (see ``_matches_category``): a column belongs to the FIRST
    category in ``AVAILABILITY_WINDOWS`` (dict order) for which it contains one
    of the ``patterns`` as a substring, starts with one of the ``prefixes``, or
    ends with one of the ``suffixes``.
    """

    # Length of the full forecast horizon in hours (14 days).
    FULL_HORIZON_HOURS = 336

    # Feature categories with their availability windows.
    #
    # 'horizon_hours' semantics as used by the methods below:
    #   inf / >= 336 -> full horizon, 0 -> forward-filled (treated as full),
    #   0 < h < 336  -> partial (first h hours available), < 0 -> historical.
    # The 'description', 'expected_count', 'forward_fill' and
    # 'requires_masking' keys are informational: no method in this class
    # reads them.
    AVAILABILITY_WINDOWS = {
        # FULL HORIZON - D+14 (336 hours)
        'temporal': {
            'horizon_hours': float('inf'),  # Always computable
            'description': 'Time-based features (hour, day, month, weekday, etc.)',
            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
            'suffixes': ['_sin', '_cos'],
            'expected_count': 12,
        },
        'weather': {
            'horizon_hours': 336,  # D+14 weather forecasts
            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_',
                         'solar_', 'cloud_', 'pressure_'],
            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
        },
        'cnec_outages': {
            'horizon_hours': 336,  # D+14+ planned transmission outages
            'description': 'Planned CNEC transmission outages (published weeks ahead)',
            'prefixes': ['outage_cnec_'],
            'expected_count': 176,
        },
        'lta': {
            'horizon_hours': 0,  # D+0 only (current value)
            'description': 'Long-term allocations (forward-filled from D+0)',
            'prefixes': ['lta_'],
            'expected_count': 40,
            'forward_fill': True,  # Special handling: forward-fill current value
        },

        # PARTIAL HORIZON - D+1 only (24 hours)
        'load_forecast': {
            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
            'description': 'Day-ahead load forecasts (published D-1)',
            'prefixes': ['load_forecast_'],
            'expected_count': 12,
            'requires_masking': True,  # Mask hours 25-336
        },

        # HISTORICAL ONLY - Not available for forecasting
        'prices': {
            'horizon_hours': -1,  # Historical only
            'description': 'Day-ahead electricity prices (determined D-1)',
            'prefixes': ['price_'],
            'expected_count': 24,
        },
        'generation': {
            'horizon_hours': -1,
            'description': 'Actual generation by fuel type',
            'prefixes': ['gen_'],
            'expected_count': 183,  # 12 zones × ~15 fuel types
        },
        'demand': {
            'horizon_hours': -1,
            'description': 'Actual electricity demand',
            'prefixes': ['demand_'],
            'expected_count': 24,  # 12 zones + aggregates
        },
        'border_lags': {
            'horizon_hours': -1,
            'description': 'Lagged cross-border flows',
            'patterns': ['_lag_', '_L', 'border_'],
            # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
            # NOTE(review): 38 × 7 = 266, not 264 — confirm which figure is right.
            'expected_count': 264,
        },
        'cnec_flows': {
            'horizon_hours': -1,
            'description': 'Historical CNEC flows and constraints',
            'prefixes': ['cnec_'],
            'patterns': ['_flow', '_binding', '_margin', '_ram'],
            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
        },
        'netpos': {
            'horizon_hours': -1,
            'description': 'Historical net positions',
            'prefixes': ['netpos_'],
            'expected_count': 48,  # 12 zones × 4 metrics
        },
        'system_agg': {
            'horizon_hours': -1,
            'description': 'System-level aggregates',
            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
            'expected_count': 353,  # Various aggregations
        },
        'pumped_storage': {
            'horizon_hours': -1,
            'description': 'Pumped hydro storage generation',
            'prefixes': ['pumped_'],
            'expected_count': 7,  # Countries with pumped storage
        },
        'hydro_storage': {
            'horizon_hours': -1,
            'description': 'Hydro reservoir levels (weekly data)',
            'prefixes': ['hydro_storage_'],
            'expected_count': 7,
        },
    }

    @classmethod
    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
        """
        Categorize all features by their availability windows.

        The ``'timestamp'`` column and every column starting with
        ``'target_border_'`` are skipped and appear in no bucket. Every other
        column lands in exactly one bucket, decided by the first matching
        category in ``AVAILABILITY_WINDOWS``.

        Args:
            columns: All column names from dataset

        Returns:
            Dictionary with categories:
            - full_horizon_d14: Available for full 14-day forecast (also holds
              forward-filled horizon-0 features such as LTA)
            - partial_d1: Available for only the first part of the horizon
              (requires masking); with the shipped configuration this is the
              24-hour load forecasts
            - historical: Not available for forecasting
            - uncategorized: Features that don't match any pattern
        """
        full_horizon_d14: List[str] = []
        partial_d1: List[str] = []
        historical: List[str] = []
        uncategorized: List[str] = []

        for col in columns:
            # Skip metadata columns
            if col == 'timestamp' or col.startswith('target_border_'):
                continue

            for config in cls.AVAILABILITY_WINDOWS.values():
                if not cls._matches_category(col, config):
                    continue

                horizon = config['horizon_hours']
                if horizon < 0:
                    historical.append(col)
                elif horizon == 0 or horizon >= cls.FULL_HORIZON_HOURS:
                    # horizon 0 is the forward-filled case (LTA): the current
                    # value is carried across the whole horizon.
                    full_horizon_d14.append(col)
                else:
                    # Any 0 < horizon < full horizon is partial. Previously
                    # only exactly 24 was handled and other values dropped
                    # the column from every bucket without warning.
                    partial_d1.append(col)
                break  # first matching category wins
            else:
                uncategorized.append(col)

        return {
            'full_horizon_d14': full_horizon_d14,
            'partial_d1': partial_d1,
            'historical': historical,
            'uncategorized': uncategorized,
        }

    @classmethod
    def _matches_category(cls, col: str, config: Dict) -> bool:
        """
        Check if column matches category patterns.

        A column matches when it contains any ``patterns`` entry as a
        substring, starts with any ``prefixes`` entry, or ends with any
        ``suffixes`` entry. Missing keys are treated as empty.

        NOTE(review): ``patterns`` are substring tests, so short entries such
        as 'day', 'hour' or '_L' match any column containing them. Because
        callers stop at the first matching category and 'temporal' is listed
        first, a historical column whose name contains e.g. 'day' would be
        classified as always available. Verify against the real column names.

        Args:
            col: Column name to test
            config: One entry of ``AVAILABILITY_WINDOWS``

        Returns:
            True if the column matches the category, else False
        """
        # Substring test (an exact match is just a special case of it).
        if any(pattern in col for pattern in config.get('patterns', ())):
            return True

        # str.startswith / str.endswith accept a tuple of alternatives; an
        # empty tuple never matches.
        if col.startswith(tuple(config.get('prefixes', ()))):
            return True

        return col.endswith(tuple(config.get('suffixes', ())))

    @classmethod
    def create_availability_mask(
        cls,
        feature_name: str,
        forecast_horizon_hours: int = 336
    ) -> np.ndarray:
        """
        Create binary availability mask for a feature across forecast horizon.

        Args:
            feature_name: Name of the feature
            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

        Returns:
            float32 array of length ``forecast_horizon_hours``:
            1 = available, 0 = masked/unavailable. Features matching no
            category get an all-zero mask (conservative: treated as
            historical).
        """
        for config in cls.AVAILABILITY_WINDOWS.values():
            if not cls._matches_category(feature_name, config):
                continue

            horizon = config['horizon_hours']

            # Covers the whole requested horizon (inf compares >= any int),
            # or forward-filled from D+0 (horizon == 0, e.g. LTA).
            if horizon >= forecast_horizon_hours or horizon == 0:
                return np.ones(forecast_horizon_hours, dtype=np.float32)

            mask = np.zeros(forecast_horizon_hours, dtype=np.float32)
            if horizon > 0:
                # Partial horizon (e.g., D+1 = 24 hours): first hours only.
                mask[:int(horizon)] = 1.0
            # horizon < 0: historical only, mask stays all zeros.
            return mask

        # Unknown feature: assume historical (conservative)
        return np.zeros(forecast_horizon_hours, dtype=np.float32)

    @classmethod
    def validate_categorization(
        cls,
        categories: Dict[str, List[str]],
        verbose: bool = True
    ) -> Tuple[bool, List[str]]:
        """
        Validate feature categorization against expected counts.

        Checks the total feature count, the size of the full-horizon and
        partial buckets, and that nothing is left uncategorized.

        Args:
            categories: Output from categorize_features()
            verbose: Print validation details

        Returns:
            (is_valid, warnings) where is_valid is True only when no warning
            was produced
        """
        issues: List[str] = []

        full_d14 = len(categories['full_horizon_d14'])
        partial_d1 = len(categories['partial_d1'])
        historical = len(categories['historical'])
        uncategorized = categories['uncategorized']

        # Total feature count (excl. timestamp + 38 targets)
        total_features = sum(len(v) for v in categories.values())
        expected_total = 2514  # 2,553 columns - 1 timestamp - 38 targets
        if total_features != expected_total:
            issues.append(
                f"Feature count mismatch: {total_features} vs expected {expected_total}"
            )

        # Check full-horizon D+14 features
        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
        full_low, full_high = 200, 700
        if full_d14 < full_low or full_d14 > full_high:
            # The message is built from the same bounds as the check so the
            # two cannot drift apart (it used to say "~240-640").
            issues.append(
                f"Full-horizon D+14 count unusual: {full_d14} "
                f"(expected {full_low}-{full_high})"
            )

        # Check partial D+1 features
        if partial_d1 != 12:
            issues.append(
                f"Partial D+1 count: {partial_d1} (expected 12 load forecasts)"
            )

        # Check uncategorized
        if uncategorized:
            issues.append(
                f"Uncategorized features: {len(uncategorized)} "
                f"(first 5: {uncategorized[:5]})"
            )

        if verbose:
            print("="*60)
            print("FEATURE CATEGORIZATION VALIDATION")
            print("="*60)
            print(f"Full-horizon D+14: {full_d14:4d} features")
            print(f"Partial D+1: {partial_d1:4d} features")
            print(f"Historical only: {historical:4d} features")
            print(f"Uncategorized: {len(uncategorized):4d} features")
            print(f"Total: {total_features:4d} features")

            if issues:
                print("\n[!] WARNINGS:")
                for issue in issues:
                    print(f" - {issue}")
            else:
                print("\n[OK] Validation passed!")
            print("="*60)

        return not issues, issues

    @classmethod
    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Generate summary table of feature categorization.

        Args:
            categories: Output from categorize_features()

        Returns:
            DataFrame with one row per bucket and the columns Category, Count,
            Availability, Masking and Sample Features (first three feature
            names, comma-separated). The Uncategorized row is included only
            when that bucket is non-empty.
        """
        # (bucket key, label, availability text, masking text)
        bucket_specs = [
            ('full_horizon_d14', 'Full-horizon D+14',
             'D+1 to D+14 (336 hours)', 'None'),
            ('partial_d1', 'Partial D+1',
             'D+1 only (24 hours)', 'Mask D+2 to D+14'),
            ('historical', 'Historical only',
             'Not available for forecasting', 'All zeros'),
        ]
        if categories['uncategorized']:
            bucket_specs.append(
                ('uncategorized', 'Uncategorized',
                 'Unknown (conservative: historical)', 'All zeros (conservative)')
            )

        summary = [
            {
                'Category': label,
                'Count': len(categories[key]),
                'Availability': availability,
                'Masking': masking,
                'Sample Features': ', '.join(categories[key][:3]),
            }
            for key, label, availability, masking in bucket_specs
        ]

        return pd.DataFrame(summary)