fbmc-chronos2 / src /data_collection /hf_datasets_manager.py
Evgueni Poloukarov
feat: Day 0 - Initialize FBMC Flow Forecasting MVP
4202f60
raw
history blame
5.39 kB
"""HuggingFace Datasets manager for FBMC data storage.
This utility manages uploading/downloading Parquet files to/from HuggingFace Datasets.
Following best practices: Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""
import polars as pl
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
from pathlib import Path
from dotenv import load_dotenv
import os
from typing import Optional
class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets.

    Credentials come from the environment (a ``.env`` file is loaded first):
    ``HF_TOKEN`` and ``HF_USERNAME``. When the token is missing or still
    contains the ``your_hf`` placeholder, ``self.api`` is ``None`` and the
    upload and list operations are disabled.
    """

    def __init__(self):
        """Initialize with HF credentials from .env file."""
        # Load environment variables from .env
        load_dotenv()
        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')

        # A token containing 'your_hf' is treated as an unedited placeholder.
        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️ HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def _qualified_name(self, dataset_name: str) -> Optional[str]:
        """Return ``owner/name`` for *dataset_name*, or None if it cannot be built.

        A name that already contains ``/`` is returned unchanged. Otherwise the
        configured username is prepended; without a username there is no valid
        repo id, so None is returned instead of producing ``None/<name>``.
        """
        if '/' in dataset_name:
            return dataset_name
        if not self.hf_username:
            return None
        return f"{self.hf_username}/{dataset_name}"

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025'),
                with or without username prefix
            description: Optional dataset description. NOTE(review): accepted
                for interface compatibility but not forwarded to the Hub.
            private: Whether dataset should be private (default: False for free storage)

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None

        full_name = self._qualified_name(dataset_name)
        if full_name is None:
            print("❌ Cannot upload: HF_USERNAME not configured")
            return None

        # Accept plain strings as well as Path objects.
        parquet_path = Path(parquet_path)
        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")

        try:
            # Load Parquet as polars, convert to HF Dataset
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())

            # Upload to HF
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private
            )
        except Exception as e:
            # Best-effort utility: report the failure and signal it with None.
            print(f"❌ Upload failed: {e}")
            return None

        print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
        return full_name

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> "Optional[pl.DataFrame]":
        """Download dataset from HF to local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix)
            output_path: Local path to save Parquet file
            split: Dataset split to download (default: 'train')

        Returns:
            Polars DataFrame or None if download fails
        """
        # Add username prefix if not present
        full_name = self._qualified_name(dataset_name)
        if full_name is None:
            print("❌ Cannot download: HF_USERNAME not configured")
            return None

        from datasets import load_dataset

        # Accept plain strings as well as Path objects.
        output_path = Path(output_path)
        print(f"📥 Downloading {full_name} from HF Datasets...")

        # Forward the token only when one is configured, so datasets uploaded
        # with private=True can be read back; public access works without it.
        load_kwargs = {"split": split}
        if self.api is not None:
            load_kwargs["token"] = self.hf_token

        try:
            # Download from HF
            dataset = load_dataset(full_name, **load_kwargs)

            # Convert to polars and save
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)
        except Exception as e:
            # Best-effort utility: report the failure and signal it with None.
            print(f"❌ Download failed: {e}")
            return None

        print(f"✅ Downloaded to: {output_path}")
        print(f" Shape: {df.shape}")
        return df

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets, i.e. those whose id
                contains 'fbmc' case-insensitively (default: True)

        Returns:
            List of dataset info objects as yielded by ``HfApi.list_datasets``
            (each exposes an ``id`` attribute); empty list on failure
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []
        if not self.hf_username:
            # Without an author the listing would not be limited to one account.
            print("❌ Cannot list: HF_USERNAME not configured")
            return []

        try:
            found = list(self.api.list_datasets(author=self.hf_username))
        except Exception as e:
            print(f"❌ List failed: {e}")
            return []

        if filter_fbmc:
            found = [d for d in found if 'fbmc' in d.id.lower()]

        print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
        for ds in found:
            print(f" - {ds.id}")
        return found
# Example usage
if __name__ == "__main__":
    # Smoke-test the configuration: build the manager and report what it read.
    mgr = FBMCDatasetManager()

    print("HF Datasets Manager initialized")
    print(f"Username: {mgr.hf_username}")
    token_ready = mgr.api is not None
    print(f"Token configured: {token_ready}")

    # Upload example (will be used in Day 1)
    # mgr.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # mgr.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )