"""HuggingFace Datasets manager for FBMC data storage.

This utility manages uploading/downloading Parquet files to/from
HuggingFace Datasets.

Following best practices:
    Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""

import os
from pathlib import Path
from typing import Optional

import polars as pl
from datasets import Dataset, DatasetDict
from dotenv import load_dotenv
from huggingface_hub import HfApi


class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets.

    Credentials are read from the environment (after loading a ``.env`` file):

    * ``HF_TOKEN``    -> ``self.hf_token``
    * ``HF_USERNAME`` -> ``self.hf_username``

    ``self.api`` is an ``HfApi`` client when a usable token is present and
    ``None`` otherwise; methods that require authentication check it first.
    """

    def __init__(self):
        """Initialize with HF credentials from .env file."""
        # Load environment variables from .env
        load_dotenv()

        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')

        # A token containing 'your_hf' is treated as an unfilled template
        # placeholder, i.e. the same as having no token at all.
        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️ HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def _qualified_name(self, dataset_name: str) -> Optional[str]:
        """Return ``username/dataset`` for *dataset_name*.

        A name that already contains ``/`` is returned unchanged. A bare name
        is prefixed with ``self.hf_username``; if no username is configured
        the name cannot be qualified and ``None`` is returned.
        """
        if '/' in dataset_name:
            return dataset_name
        if not self.hf_username:
            return None
        return f"{self.hf_username}/{dataset_name}"

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025'),
                with or without username prefix
            description: Optional dataset description. Accepted for API
                compatibility; it is currently not sent to the Hub.
            private: Whether dataset should be private (default: False for free storage)

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None

        # Without a username a bare name would become "None/<name>".
        full_name = self._qualified_name(dataset_name)
        if full_name is None:
            print("❌ Cannot upload: HF_USERNAME not configured")
            return None

        parquet_path = Path(parquet_path)  # also accept plain string paths
        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")

        try:
            # Load Parquet as polars, convert to HF Dataset
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())

            # Upload to HF
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private
            )
        except Exception as e:
            # Best-effort API: report the failure and signal it with None.
            print(f"❌ Upload failed: {e}")
            return None

        print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
        return full_name

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> Optional[pl.DataFrame]:
        """Download dataset from HF to local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix)
            output_path: Local path to save Parquet file
            split: Dataset split to download (default: 'train')

        Returns:
            Polars DataFrame or None if download fails
        """
        from datasets import load_dataset

        # Add username prefix if not present
        full_name = self._qualified_name(dataset_name)
        if full_name is None:
            print("❌ Cannot download: HF_USERNAME not configured "
                  "and dataset name has no username prefix")
            return None

        output_path = Path(output_path)  # also accept plain string paths
        # Forward the token only when it is usable, so that datasets uploaded
        # with private=True can be fetched; token=None keeps default behaviour.
        token = self.hf_token if self.api else None

        print(f"📥 Downloading {full_name} from HF Datasets...")

        try:
            # Download from HF
            dataset = load_dataset(full_name, split=split, token=token)

            # Convert to polars and save
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)
        except Exception as e:
            # Best-effort API: report the failure and signal it with None.
            print(f"❌ Download failed: {e}")
            return None

        print(f"✅ Downloaded to: {output_path}")
        print(f" Shape: {df.shape}")
        return df

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets (default: True)

        Returns:
            List of dataset info objects as returned by ``HfApi.list_datasets``
            (empty list when not configured or when the request fails)
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []

        # author=None would not restrict the listing to this user.
        if not self.hf_username:
            print("❌ Cannot list: HF_USERNAME not configured")
            return []

        try:
            datasets = list(self.api.list_datasets(author=self.hf_username))
        except Exception as e:
            print(f"❌ List failed: {e}")
            return []

        if filter_fbmc:
            datasets = [d for d in datasets if 'fbmc' in d.id.lower()]

        print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
        for ds in datasets:
            print(f" - {ds.id}")

        return datasets


# Example usage
if __name__ == "__main__":
    manager = FBMCDatasetManager()

    # Test configuration
    print("HF Datasets Manager initialized")
    print(f"Username: {manager.hf_username}")
    print(f"Token configured: {manager.api is not None}")

    # Upload example (will be used in Day 1)
    # manager.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # manager.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )