Spaces:
Running
Running
Commit
·
d51552b
1
Parent(s):
2d9152d
feat: Add dataset streaming support for faster UI load times
Browse files
Enable Hugging Face dataset streaming by default to improve UX by loading
data progressively instead of blocking the UI during downloads.
Changes:
- Add use_streaming parameter to DataLoader (default: True)
- Update all HF dataset load methods to support streaming mode
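- Streaming mode reads rows on the fly instead of downloading the full dataset first; each loader still materializes the whole split with list(ds) before returning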
- Return types unchanged (pd.DataFrame, List[Dict]) - UI compatible
- Can be disabled via the USE_STREAMING=false environment variable (any value other than "true", compared case-insensitively, disables streaming)
Benefits:
- Faster initial page load (starts streaming immediately)
- Better UX (no blank screen while downloading)
- Data is still cached after the first load for subsequent access
- Backward compatible with non-streaming mode
- data_loader.py +60 -20
data_loader.py
CHANGED
|
@@ -32,12 +32,14 @@ class DataLoader:
|
|
| 32 |
data_source: DataSource = "both",
|
| 33 |
json_data_path: Optional[str] = None,
|
| 34 |
leaderboard_dataset: Optional[str] = None,
|
| 35 |
-
hf_token: Optional[str] = None
|
|
|
|
| 36 |
):
|
| 37 |
self.data_source = data_source
|
| 38 |
self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
|
| 39 |
self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "kshitijthakkar/smoltrace-leaderboard")
|
| 40 |
self.hf_token = hf_token or os.getenv("HF_TOKEN")
|
|
|
|
| 41 |
|
| 42 |
# Cache
|
| 43 |
self._cache: Dict[str, Any] = {}
|
|
@@ -79,11 +81,26 @@ class DataLoader:
|
|
| 79 |
raise ValueError("No valid data source available")
|
| 80 |
|
| 81 |
def _load_leaderboard_from_hf(self) -> pd.DataFrame:
|
| 82 |
-
"""Load leaderboard from HuggingFace dataset"""
|
| 83 |
try:
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
return df
|
| 88 |
except Exception as e:
|
| 89 |
print(f"[ERROR] Loading from HuggingFace: {e}")
|
|
@@ -142,10 +159,17 @@ class DataLoader:
|
|
| 142 |
raise ValueError("No valid data source available")
|
| 143 |
|
| 144 |
def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
|
| 145 |
-
"""Load results from HuggingFace dataset"""
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
return df
|
| 150 |
|
| 151 |
def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
|
|
@@ -203,10 +227,16 @@ class DataLoader:
|
|
| 203 |
raise ValueError("No valid data source available")
|
| 204 |
|
| 205 |
def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
|
| 206 |
-
"""Load traces from HuggingFace dataset"""
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
return traces
|
| 211 |
|
| 212 |
def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
|
|
@@ -264,16 +294,24 @@ class DataLoader:
|
|
| 264 |
return pd.DataFrame()
|
| 265 |
|
| 266 |
def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
|
| 267 |
-
"""Load metrics from HuggingFace dataset (flat format)"""
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
# Convert timestamp strings to datetime if needed
|
| 272 |
-
if 'timestamp' in df.columns:
|
| 273 |
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 274 |
|
| 275 |
-
|
| 276 |
-
|
| 277 |
return df
|
| 278 |
|
| 279 |
def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
|
|
@@ -421,10 +459,12 @@ def create_data_loader_from_env() -> DataLoader:
|
|
| 421 |
Configured DataLoader instance
|
| 422 |
"""
|
| 423 |
data_source = os.getenv("DATA_SOURCE", "both")
|
|
|
|
| 424 |
|
| 425 |
return DataLoader(
|
| 426 |
data_source=data_source,
|
| 427 |
json_data_path=os.getenv("JSON_DATA_PATH"),
|
| 428 |
leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
|
| 429 |
-
hf_token=os.getenv("HF_TOKEN")
|
|
|
|
| 430 |
)
|
|
|
|
| 32 |
data_source: DataSource = "both",
|
| 33 |
json_data_path: Optional[str] = None,
|
| 34 |
leaderboard_dataset: Optional[str] = None,
|
| 35 |
+
hf_token: Optional[str] = None,
|
| 36 |
+
use_streaming: bool = True
|
| 37 |
):
|
| 38 |
self.data_source = data_source
|
| 39 |
self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
|
| 40 |
self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "kshitijthakkar/smoltrace-leaderboard")
|
| 41 |
self.hf_token = hf_token or os.getenv("HF_TOKEN")
|
| 42 |
+
self.use_streaming = use_streaming
|
| 43 |
|
| 44 |
# Cache
|
| 45 |
self._cache: Dict[str, Any] = {}
|
|
|
|
| 81 |
raise ValueError("No valid data source available")
|
| 82 |
|
| 83 |
def _load_leaderboard_from_hf(self) -> pd.DataFrame:
|
| 84 |
+
"""Load leaderboard from HuggingFace dataset with optional streaming"""
|
| 85 |
try:
|
| 86 |
+
if self.use_streaming:
|
| 87 |
+
print("[INFO] Loading leaderboard with streaming...")
|
| 88 |
+
# Load with streaming for faster initial response
|
| 89 |
+
ds = load_dataset(
|
| 90 |
+
self.leaderboard_dataset,
|
| 91 |
+
split="train",
|
| 92 |
+
token=self.hf_token,
|
| 93 |
+
streaming=True
|
| 94 |
+
)
|
| 95 |
+
# Convert streamed data to list of dicts, then to DataFrame
|
| 96 |
+
data = list(ds)
|
| 97 |
+
df = pd.DataFrame(data)
|
| 98 |
+
print(f"[OK] Streamed leaderboard from HuggingFace: {len(df)} rows")
|
| 99 |
+
else:
|
| 100 |
+
# Traditional full download
|
| 101 |
+
ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
|
| 102 |
+
df = ds.to_pandas()
|
| 103 |
+
print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
|
| 104 |
return df
|
| 105 |
except Exception as e:
|
| 106 |
print(f"[ERROR] Loading from HuggingFace: {e}")
|
|
|
|
| 159 |
raise ValueError("No valid data source available")
|
| 160 |
|
| 161 |
def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
|
| 162 |
+
"""Load results from HuggingFace dataset with optional streaming"""
|
| 163 |
+
if self.use_streaming:
|
| 164 |
+
print(f"[INFO] Streaming results from {dataset_id}...")
|
| 165 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
|
| 166 |
+
data = list(ds)
|
| 167 |
+
df = pd.DataFrame(data)
|
| 168 |
+
print(f"[OK] Streamed results from HuggingFace: {len(df)} rows")
|
| 169 |
+
else:
|
| 170 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token)
|
| 171 |
+
df = ds.to_pandas()
|
| 172 |
+
print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
|
| 173 |
return df
|
| 174 |
|
| 175 |
def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
|
|
|
|
| 227 |
raise ValueError("No valid data source available")
|
| 228 |
|
| 229 |
def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
|
| 230 |
+
"""Load traces from HuggingFace dataset with optional streaming"""
|
| 231 |
+
if self.use_streaming:
|
| 232 |
+
print(f"[INFO] Streaming traces from {dataset_id}...")
|
| 233 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
|
| 234 |
+
traces = list(ds)
|
| 235 |
+
print(f"[OK] Streamed traces from HuggingFace: {len(traces)} traces")
|
| 236 |
+
else:
|
| 237 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token)
|
| 238 |
+
traces = ds.to_pandas().to_dict("records")
|
| 239 |
+
print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
|
| 240 |
return traces
|
| 241 |
|
| 242 |
def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
|
|
|
|
| 294 |
return pd.DataFrame()
|
| 295 |
|
| 296 |
def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
|
| 297 |
+
"""Load metrics from HuggingFace dataset (flat format) with optional streaming"""
|
| 298 |
+
if self.use_streaming:
|
| 299 |
+
print(f"[INFO] Streaming metrics from {dataset_id}...")
|
| 300 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
|
| 301 |
+
data = list(ds)
|
| 302 |
+
df = pd.DataFrame(data)
|
| 303 |
+
print(f"[OK] Streamed metrics from HuggingFace: {len(df)} rows")
|
| 304 |
+
else:
|
| 305 |
+
ds = load_dataset(dataset_id, split="train", token=self.hf_token)
|
| 306 |
+
df = ds.to_pandas()
|
| 307 |
+
print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
|
| 308 |
|
| 309 |
# Convert timestamp strings to datetime if needed
|
| 310 |
+
if 'timestamp' in df.columns and not df.empty:
|
| 311 |
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 312 |
|
| 313 |
+
if not df.empty:
|
| 314 |
+
print(f" Columns: {list(df.columns)}")
|
| 315 |
return df
|
| 316 |
|
| 317 |
def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
|
|
|
|
| 459 |
Configured DataLoader instance
|
| 460 |
"""
|
| 461 |
data_source = os.getenv("DATA_SOURCE", "both")
|
| 462 |
+
use_streaming = os.getenv("USE_STREAMING", "true").lower() == "true"
|
| 463 |
|
| 464 |
return DataLoader(
|
| 465 |
data_source=data_source,
|
| 466 |
json_data_path=os.getenv("JSON_DATA_PATH"),
|
| 467 |
leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
|
| 468 |
+
hf_token=os.getenv("HF_TOKEN"),
|
| 469 |
+
use_streaming=use_streaming
|
| 470 |
)
|