kshitijthakkar committed on
Commit
d51552b
·
1 Parent(s): 2d9152d

feat: Add dataset streaming support for faster UI load times

Browse files

Enable HuggingFace dataset streaming by default to improve UX by loading
data progressively instead of blocking UI during downloads.

Changes:
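- Add a use_streaming parameter to DataLoader (default: True)
- Update all HF dataset load methods to support streaming mode
- In streaming mode, rows are read on the fly instead of waiting for a full dataset download; they are still collected into a list before being returned
- Return types are unchanged (pd.DataFrame, List[Dict]), so the UI remains compatible
- Streaming can be disabled via the USE_STREAMING environment variable (any value other than "true", case-insensitive, e.g. USE_STREAMING=false)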

Benefits:
- Faster initial page load (streaming starts immediately)
- Better UX (no blank screen while downloading)
- Data is still cached after the first load for subsequent access
- Backward compatible with non-streaming mode

Files changed (1) hide show
  1. data_loader.py +60 -20
data_loader.py CHANGED
@@ -32,12 +32,14 @@ class DataLoader:
32
  data_source: DataSource = "both",
33
  json_data_path: Optional[str] = None,
34
  leaderboard_dataset: Optional[str] = None,
35
- hf_token: Optional[str] = None
 
36
  ):
37
  self.data_source = data_source
38
  self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
39
  self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "kshitijthakkar/smoltrace-leaderboard")
40
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
 
41
 
42
  # Cache
43
  self._cache: Dict[str, Any] = {}
@@ -79,11 +81,26 @@ class DataLoader:
79
  raise ValueError("No valid data source available")
80
 
81
  def _load_leaderboard_from_hf(self) -> pd.DataFrame:
82
- """Load leaderboard from HuggingFace dataset"""
83
  try:
84
- ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
85
- df = ds.to_pandas()
86
- print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  return df
88
  except Exception as e:
89
  print(f"[ERROR] Loading from HuggingFace: {e}")
@@ -142,10 +159,17 @@ class DataLoader:
142
  raise ValueError("No valid data source available")
143
 
144
  def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
145
- """Load results from HuggingFace dataset"""
146
- ds = load_dataset(dataset_id, split="train", token=self.hf_token)
147
- df = ds.to_pandas()
148
- print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
 
 
 
 
 
 
 
149
  return df
150
 
151
  def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
@@ -203,10 +227,16 @@ class DataLoader:
203
  raise ValueError("No valid data source available")
204
 
205
  def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
206
- """Load traces from HuggingFace dataset"""
207
- ds = load_dataset(dataset_id, split="train", token=self.hf_token)
208
- traces = ds.to_pandas().to_dict("records")
209
- print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
 
 
 
 
 
 
210
  return traces
211
 
212
  def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
@@ -264,16 +294,24 @@ class DataLoader:
264
  return pd.DataFrame()
265
 
266
  def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
267
- """Load metrics from HuggingFace dataset (flat format)"""
268
- ds = load_dataset(dataset_id, split="train", token=self.hf_token)
269
- df = ds.to_pandas()
 
 
 
 
 
 
 
 
270
 
271
  # Convert timestamp strings to datetime if needed
272
- if 'timestamp' in df.columns:
273
  df['timestamp'] = pd.to_datetime(df['timestamp'])
274
 
275
- print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
276
- print(f" Columns: {list(df.columns)}")
277
  return df
278
 
279
  def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
@@ -421,10 +459,12 @@ def create_data_loader_from_env() -> DataLoader:
421
  Configured DataLoader instance
422
  """
423
  data_source = os.getenv("DATA_SOURCE", "both")
 
424
 
425
  return DataLoader(
426
  data_source=data_source,
427
  json_data_path=os.getenv("JSON_DATA_PATH"),
428
  leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
429
- hf_token=os.getenv("HF_TOKEN")
 
430
  )
 
32
  data_source: DataSource = "both",
33
  json_data_path: Optional[str] = None,
34
  leaderboard_dataset: Optional[str] = None,
35
+ hf_token: Optional[str] = None,
36
+ use_streaming: bool = True
37
  ):
38
  self.data_source = data_source
39
  self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
40
  self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "kshitijthakkar/smoltrace-leaderboard")
41
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
42
+ self.use_streaming = use_streaming
43
 
44
  # Cache
45
  self._cache: Dict[str, Any] = {}
 
81
  raise ValueError("No valid data source available")
82
 
83
  def _load_leaderboard_from_hf(self) -> pd.DataFrame:
84
+ """Load leaderboard from HuggingFace dataset with optional streaming"""
85
  try:
86
+ if self.use_streaming:
87
+ print("[INFO] Loading leaderboard with streaming...")
88
+ # Load with streaming for faster initial response
89
+ ds = load_dataset(
90
+ self.leaderboard_dataset,
91
+ split="train",
92
+ token=self.hf_token,
93
+ streaming=True
94
+ )
95
+ # Convert streamed data to list of dicts, then to DataFrame
96
+ data = list(ds)
97
+ df = pd.DataFrame(data)
98
+ print(f"[OK] Streamed leaderboard from HuggingFace: {len(df)} rows")
99
+ else:
100
+ # Traditional full download
101
+ ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
102
+ df = ds.to_pandas()
103
+ print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
104
  return df
105
  except Exception as e:
106
  print(f"[ERROR] Loading from HuggingFace: {e}")
 
159
  raise ValueError("No valid data source available")
160
 
161
  def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
162
+ """Load results from HuggingFace dataset with optional streaming"""
163
+ if self.use_streaming:
164
+ print(f"[INFO] Streaming results from {dataset_id}...")
165
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
166
+ data = list(ds)
167
+ df = pd.DataFrame(data)
168
+ print(f"[OK] Streamed results from HuggingFace: {len(df)} rows")
169
+ else:
170
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
171
+ df = ds.to_pandas()
172
+ print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
173
  return df
174
 
175
  def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
 
227
  raise ValueError("No valid data source available")
228
 
229
  def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
230
+ """Load traces from HuggingFace dataset with optional streaming"""
231
+ if self.use_streaming:
232
+ print(f"[INFO] Streaming traces from {dataset_id}...")
233
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
234
+ traces = list(ds)
235
+ print(f"[OK] Streamed traces from HuggingFace: {len(traces)} traces")
236
+ else:
237
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
238
+ traces = ds.to_pandas().to_dict("records")
239
+ print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
240
  return traces
241
 
242
  def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
 
294
  return pd.DataFrame()
295
 
296
  def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
297
+ """Load metrics from HuggingFace dataset (flat format) with optional streaming"""
298
+ if self.use_streaming:
299
+ print(f"[INFO] Streaming metrics from {dataset_id}...")
300
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token, streaming=True)
301
+ data = list(ds)
302
+ df = pd.DataFrame(data)
303
+ print(f"[OK] Streamed metrics from HuggingFace: {len(df)} rows")
304
+ else:
305
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
306
+ df = ds.to_pandas()
307
+ print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
308
 
309
  # Convert timestamp strings to datetime if needed
310
+ if 'timestamp' in df.columns and not df.empty:
311
  df['timestamp'] = pd.to_datetime(df['timestamp'])
312
 
313
+ if not df.empty:
314
+ print(f" Columns: {list(df.columns)}")
315
  return df
316
 
317
  def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
 
459
  Configured DataLoader instance
460
  """
461
  data_source = os.getenv("DATA_SOURCE", "both")
462
+ use_streaming = os.getenv("USE_STREAMING", "true").lower() == "true"
463
 
464
  return DataLoader(
465
  data_source=data_source,
466
  json_data_path=os.getenv("JSON_DATA_PATH"),
467
  leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
468
+ hf_token=os.getenv("HF_TOKEN"),
469
+ use_streaming=use_streaming
470
  )