"""Extend 24-month dataset with October 2025 features.
Merges October feature files and appends to existing 24-month unified dataset.
Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025)
Author: Claude
Date: 2025-11-14
"""
import sys
from pathlib import Path

import polars as pl

def merge_october_features() -> pl.DataFrame:
    """Merge October feature files into a single dataframe."""
    print("\n" + "=" * 80)
    print("MERGING OCTOBER FEATURES")
    print("=" * 80)

    processed_dir = Path("data/processed")

    # Load October feature files
    weather_file = processed_dir / "features_weather_october.parquet"
    entsoe_file = processed_dir / "features_entsoe_october.parquet"
    jao_file = processed_dir / "features_jao_october.parquet"

    print("\nLoading October features...")

    weather_df = pl.read_parquet(weather_file)
    # Cast timestamp to nanosecond precision for consistency
    weather_df = weather_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" Weather: {weather_df.shape}")

    entsoe_df = pl.read_parquet(entsoe_file)
    # Ensure timestamp is nanosecond precision
    entsoe_df = entsoe_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" ENTSO-E: {entsoe_df.shape}")

    # Check if JAO features exist
    if jao_file.exists():
        jao_df = pl.read_parquet(jao_file)
        print(f" JAO: {jao_df.shape}")
    else:
        jao_df = None
        print(" JAO: Not available (will use zeros)")

    # Merge features
    print("\nMerging features...")
    unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
    print(f" Weather + ENTSO-E: {unified.shape}")

    if jao_df is not None:
        unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
        print(f" + JAO: {unified.shape}")

    print(f"\n[OK] October unified features: {unified.shape}")
    return unified

def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
    """Append October features to the 24-month dataset."""
    print("\n" + "=" * 80)
    print("EXTENDING 24-MONTH DATASET")
    print("=" * 80)

    processed_dir = Path("data/processed")
    base_file = processed_dir / "features_unified_24month.parquet"

    print("\nLoading 24-month dataset...")
    base_df = pl.read_parquet(base_file)
    print(f" Shape: {base_df.shape}")
    print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")

    # Match October timestamp precision to base dataset
    base_timestamp_dtype = base_df['timestamp'].dtype
    october_features = october_features.with_columns([
        pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
    ])
    print(f" Matched timestamp precision: {base_timestamp_dtype}")

    # Get column lists
    base_cols = set(base_df.columns)
    october_cols = set(october_features.columns)

    # Find missing columns in October (JAO features likely missing)
    missing_in_october = base_cols - october_cols
    if missing_in_october:
        print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
        for col in missing_in_october:
            if col != 'timestamp':
                october_features = october_features.with_columns([
                    pl.lit(None).cast(base_df[col].dtype).alias(col)
                ])

    # Ensure ALL column dtypes match exactly (not just missing ones)
    print("\n Matching column dtypes...")
    dtype_fixes = []
    for col in base_df.columns:
        if col in october_features.columns:
            base_dtype = base_df[col].dtype
            october_dtype = october_features[col].dtype
            if base_dtype != october_dtype:
                dtype_fixes.append(col)
                october_features = october_features.with_columns([
                    pl.col(col).cast(base_dtype).alias(col)
                ])
    if dtype_fixes:
        print(f" Fixed {len(dtype_fixes)} dtype mismatches")

    # Ensure column order matches
    october_features = october_features.select(base_df.columns)

    print("\nAppending October features...")
    extended_df = pl.concat([base_df, october_features], how='vertical')
    print(f" Extended shape: {extended_df.shape}")
    print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
    print(f" Rows added: {len(extended_df) - len(base_df)}")

    return extended_df

def validate_extended_dataset(extended_df: pl.DataFrame) -> bool:
    """Validate extended dataset."""
    print("\n" + "=" * 80)
    print("VALIDATING EXTENDED DATASET")
    print("=" * 80)

    expected_rows = 17880  # 24 months (17,544) + 14 days (336)
    expected_cols = 2553   # From metadata

    print("\nShape validation:")
    print(f" Rows: {len(extended_df)} (expected {expected_rows})")
    print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")

    # Check for duplicates
    duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
    print(f"\nDuplicate timestamps: {len(duplicates)}")

    # Check for gaps (skip - Duration comparison not supported in this Polars version).
    # Instead, verify continuous hourly data by comparing the row count to the expected hour count.
    expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
    actual_hours = len(extended_df)
    print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")

    # Null counts
    total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
    print(f"\nTotal null values: {total_nulls}")

    # Date range
    date_start = extended_df['timestamp'].min()
    date_end = extended_df['timestamp'].max()
    print("\nDate range:")
    print(f" Start: {date_start}")
    print(f" End: {date_end}")

    # Validation result
    issues = []
    if len(extended_df) != expected_rows:
        issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
    if len(duplicates) > 0:
        issues.append(f"Found {len(duplicates)} duplicate timestamps")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] All validation checks passed!")
        return True

def main():
    """Main execution: merge October features and extend the dataset."""
    print("\n" + "=" * 80)
    print("DATASET EXTENSION: October 2025")
    print("Extending 24-month dataset (17,544 -> 17,880 rows)")
    print("=" * 80)

    try:
        # Merge October features
        october_features = merge_october_features()

        # Extend dataset
        extended_df = extend_dataset(october_features)

        # Validate
        validation_passed = validate_extended_dataset(extended_df)

        if validation_passed:
            # Save extended dataset
            output_file = Path("data/processed/features_unified_extended.parquet")
            extended_df.write_parquet(output_file)

            print("\n" + "=" * 80)
            print("SUCCESS: Dataset extension complete!")
            print("=" * 80)
            print("\nExtended dataset saved:")
            print(f" File: {output_file}")
            print(f" Shape: {extended_df.shape}")
            print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
            print("\nNext steps:")
            print(" 1. Upload to HuggingFace Datasets")
            print(" 2. Create inference notebooks")
            print(" 3. Deploy to HF Space")
        else:
            print("\n[ERROR] Validation failed - please review issues")
            sys.exit(1)

    except Exception as e:
        # Strip non-ASCII characters so the error message prints safely on limited consoles
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Dataset extension failed: {error_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
main()