"""HuggingFace Datasets manager for FBMC data storage.

This utility manages uploading/downloading Parquet files to/from HuggingFace Datasets.
Following best practices: Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""
import os
from pathlib import Path
from typing import Optional

import polars as pl
from datasets import Dataset, DatasetDict
from dotenv import load_dotenv
from huggingface_hub import HfApi
class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets.

    Credentials are read from a local ``.env`` file (``HF_TOKEN`` /
    ``HF_USERNAME``). When no valid token is found, the manager still
    constructs, but upload/list operations are disabled (``self.api is None``).
    """

    def __init__(self):
        """Initialize with HF credentials from .env file."""
        # Load environment variables from .env into the process environment.
        load_dotenv()
        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')
        # Treat template placeholders (e.g. "your_hf_token_here") as unset.
        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️ HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025')
            description: Optional dataset description.
                NOTE(review): currently unused — ``push_to_hub`` takes no
                description argument; kept for interface stability. Set the
                description via the dataset card if needed.
            private: Whether dataset should be private (default: False for free storage)

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None
        # Fail fast with a clear message instead of an opaque read error.
        if not parquet_path.exists():
            print(f"❌ Upload failed: file not found: {parquet_path}")
            return None
        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")
        try:
            # Load Parquet with polars, then convert via pandas to an HF
            # Dataset (from_pandas is available across datasets versions).
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())
            # Create full dataset name (username/dataset-name).
            full_name = f"{self.hf_username}/{dataset_name}"
            # Upload to the Hub.
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private
            )
            print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
            return full_name
        except Exception as e:
            # Best-effort: report and return None so callers can continue.
            print(f"❌ Upload failed: {e}")
            return None

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> Optional[pl.DataFrame]:
        """Download dataset from HF to local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix)
            output_path: Local path to save Parquet file
            split: Dataset split to download (default: 'train')

        Returns:
            Polars DataFrame or None if download fails
        """
        from datasets import load_dataset

        # Add username prefix if not present; guard against producing
        # "None/<name>" when HF_USERNAME is not configured.
        if '/' not in dataset_name:
            if not self.hf_username:
                print("❌ Cannot download: HF_USERNAME not configured "
                      "and dataset name has no username prefix")
                return None
            dataset_name = f"{self.hf_username}/{dataset_name}"
        print(f"📥 Downloading {dataset_name} from HF Datasets...")
        try:
            # Pass the token so private datasets are reachable too
            # (ignored by the Hub for public datasets).
            dataset = load_dataset(dataset_name, split=split, token=self.hf_token)
            # Convert to polars and persist locally as Parquet.
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)
            print(f"✅ Downloaded to: {output_path}")
            print(f"   Shape: {df.shape}")
            return df
        except Exception as e:
            print(f"❌ Download failed: {e}")
            return None

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets (default: True)

        Returns:
            List of dataset info objects (empty list on failure)
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []
        # Without a username the author filter would query author=None.
        if not self.hf_username:
            print("❌ Cannot list: HF_USERNAME not configured")
            return []
        try:
            datasets = list(self.api.list_datasets(author=self.hf_username))
            if filter_fbmc:
                datasets = [d for d in datasets if 'fbmc' in d.id.lower()]
            print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
            for ds in datasets:
                print(f"  - {ds.id}")
            return datasets
        except Exception as e:
            print(f"❌ List failed: {e}")
            return []
# Example usage / smoke test of the credential setup
if __name__ == "__main__":
    mgr = FBMCDatasetManager()

    # Report whether credentials were picked up from .env
    print("HF Datasets Manager initialized")
    print(f"Username: {mgr.hf_username}")
    print(f"Token configured: {mgr.api is not None}")

    # Upload example (will be used in Day 1)
    # mgr.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # mgr.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )