"""Extend 24-month dataset with October 2025 features.
Merges October feature files and appends to existing 24-month unified dataset.
Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025)
Author: Claude
Date: 2025-11-14
"""
import sys
from pathlib import Path

import polars as pl

def merge_october_features() -> pl.DataFrame:
    """Merge October feature files into a single dataframe."""
    print("\n" + "=" * 80)
    print("MERGING OCTOBER FEATURES")
    print("=" * 80)

    processed_dir = Path("data/processed")

    # Load October feature files
    weather_file = processed_dir / "features_weather_october.parquet"
    entsoe_file = processed_dir / "features_entsoe_october.parquet"
    jao_file = processed_dir / "features_jao_october.parquet"

    print("\nLoading October features...")

    weather_df = pl.read_parquet(weather_file)
    # Cast timestamp to nanosecond precision for consistency
    weather_df = weather_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" Weather: {weather_df.shape}")

    entsoe_df = pl.read_parquet(entsoe_file)
    # Ensure timestamp is nanosecond precision
    entsoe_df = entsoe_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" ENTSO-E: {entsoe_df.shape}")

    # Check if JAO features exist
    if jao_file.exists():
        jao_df = pl.read_parquet(jao_file)
        print(f" JAO: {jao_df.shape}")
    else:
        jao_df = None
        print(" JAO: Not available (will use zeros)")

    # Merge features
    print("\nMerging features...")
    unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
    print(f" Weather + ENTSO-E: {unified.shape}")

    if jao_df is not None:
        unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
        print(f" + JAO: {unified.shape}")

    print(f"\n[OK] October unified features: {unified.shape}")
    return unified

def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
    """Append October features to the 24-month dataset."""
    print("\n" + "=" * 80)
    print("EXTENDING 24-MONTH DATASET")
    print("=" * 80)

    processed_dir = Path("data/processed")
    base_file = processed_dir / "features_unified_24month.parquet"

    print("\nLoading 24-month dataset...")
    base_df = pl.read_parquet(base_file)
    print(f" Shape: {base_df.shape}")
    print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")

    # Match October timestamp precision to base dataset
    base_timestamp_dtype = base_df['timestamp'].dtype
    october_features = october_features.with_columns([
        pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
    ])
    print(f" Matched timestamp precision: {base_timestamp_dtype}")

    # Get column lists
    base_cols = set(base_df.columns)
    october_cols = set(october_features.columns)

    # Find missing columns in October (JAO features likely missing)
    missing_in_october = base_cols - october_cols
    if missing_in_october:
        print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
        for col in missing_in_october:
            if col != 'timestamp':
                october_features = october_features.with_columns([
                    pl.lit(None).cast(base_df[col].dtype).alias(col)
                ])

    # Ensure ALL column dtypes match exactly (not just missing ones)
    print("\n Matching column dtypes...")
    dtype_fixes = []
    for col in base_df.columns:
        if col in october_features.columns:
            base_dtype = base_df[col].dtype
            october_dtype = october_features[col].dtype
            if base_dtype != october_dtype:
                dtype_fixes.append(col)
                october_features = october_features.with_columns([
                    pl.col(col).cast(base_dtype).alias(col)
                ])
    if dtype_fixes:
        print(f" Fixed {len(dtype_fixes)} dtype mismatches")

    # Ensure column order matches
    october_features = october_features.select(base_df.columns)

    print("\nAppending October features...")
    extended_df = pl.concat([base_df, october_features], how='vertical')
    print(f" Extended shape: {extended_df.shape}")
    print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
    print(f" Rows added: {len(extended_df) - len(base_df)}")

    return extended_df

def validate_extended_dataset(extended_df: pl.DataFrame) -> bool:
    """Validate extended dataset."""
    print("\n" + "=" * 80)
    print("VALIDATING EXTENDED DATASET")
    print("=" * 80)

    expected_rows = 17880  # 24 months (17,544) + 14 days (336)
    expected_cols = 2553   # From metadata

    print("\nShape validation:")
    print(f" Rows: {len(extended_df)} (expected {expected_rows})")
    print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")

    # Check for duplicates
    duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
    print(f"\nDuplicate timestamps: {len(duplicates)}")

    # Check for gaps (skip - Duration comparison not supported in this Polars version).
    # Instead, verify continuous hourly data by comparing the row count to the expected hour count.
    expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
    actual_hours = len(extended_df)
    print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")

    # Null counts
    total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
    print(f"\nTotal null values: {total_nulls}")

    # Date range
    date_start = extended_df['timestamp'].min()
    date_end = extended_df['timestamp'].max()
    print("\nDate range:")
    print(f" Start: {date_start}")
    print(f" End: {date_end}")

    # Validation result
    issues = []
    if len(extended_df) != expected_rows:
        issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
    if len(duplicates) > 0:
        issues.append(f"Found {len(duplicates)} duplicate timestamps")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] All validation checks passed!")
        return True

def main():
    """Main execution: merge October features and extend the dataset."""
    print("\n" + "=" * 80)
    print("DATASET EXTENSION: October 2025")
    print("Extending 24-month dataset (17,544 -> 17,880 rows)")
    print("=" * 80)

    try:
        # Merge October features
        october_features = merge_october_features()

        # Extend dataset
        extended_df = extend_dataset(october_features)

        # Validate
        validation_passed = validate_extended_dataset(extended_df)

        if validation_passed:
            # Save extended dataset
            output_file = Path("data/processed/features_unified_extended.parquet")
            extended_df.write_parquet(output_file)

            print("\n" + "=" * 80)
            print("SUCCESS: Dataset extension complete!")
            print("=" * 80)
            print("\nExtended dataset saved:")
            print(f" File: {output_file}")
            print(f" Shape: {extended_df.shape}")
            print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
            print("\nNext steps:")
            print(" 1. Upload to HuggingFace Datasets")
            print(" 2. Create inference notebooks")
            print(" 3. Deploy to HF Space")
        else:
            print("\n[ERROR] Validation failed - please review issues")
            sys.exit(1)

    except Exception as e:
        # Strip non-ASCII characters so the error message prints safely on limited consoles
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Dataset extension failed: {error_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
main()