Mandark-droid committed on
Commit
fbd2ae8
·
1 Parent(s): 0c0a9f1

fix: Remove all API key parameters from MCP tools - use environment variables only

Browse files

- Removed hf_token and gemini_api_key parameters from all MCP tool functions
- Removed Settings tab from UI (no more API key inputs)
- All SMOLTRACE datasets are public - no HF token required
- Only GEMINI_API_KEY environment variable needed
- Fixes security vulnerability where API keys were exposed in MCP examples

Security Impact: ZERO risk of API key exposure in MCP examples

Files changed (2) hide show
  1. app.py +18 -188
  2. mcp_tools.py +33 -69
app.py CHANGED
@@ -80,151 +80,7 @@ def create_gradio_ui():
80
 
81
  **MCP Endpoint (SSE - Deprecated)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse`
82
  """)
83
-
84
- # Session state for API keys
85
- gemini_key_state = gr.State(value=os.getenv("GEMINI_API_KEY", ""))
86
- hf_token_state = gr.State(value=os.getenv("HF_TOKEN", ""))
87
-
88
  with gr.Tabs():
89
- # Tab 0: Settings (API Keys)
90
- with gr.Tab("⚙️ Settings"):
91
- gr.Markdown("""
92
- ## 🔑 API Key Configuration
93
-
94
- Configure your API keys here. These will override environment variables for this session only.
95
-
96
- **Why configure here?**
97
- - No need to set environment variables
98
- - Test with different API keys easily
99
- - Secure session-only storage (not persisted)
100
-
101
- **Security Note**: API keys are stored in session state only and are not saved permanently.
102
- """)
103
-
104
- with gr.Row():
105
- with gr.Column():
106
- gr.Markdown("### Google Gemini API Key")
107
- gemini_key_input = gr.Textbox(
108
- label="Gemini API Key",
109
- placeholder="Enter your Google Gemini API key",
110
- type="password",
111
- value=os.getenv("GEMINI_API_KEY", ""),
112
- info="Get your key from: https://aistudio.google.com/app/apikey"
113
- )
114
- gemini_status = gr.Markdown("Status: Using environment variable" if os.getenv("GEMINI_API_KEY") else "⚠️ Status: No API key configured")
115
-
116
- with gr.Column():
117
- gr.Markdown("### HuggingFace Token")
118
- hf_token_input = gr.Textbox(
119
- label="HuggingFace Token",
120
- placeholder="Enter your HuggingFace token",
121
- type="password",
122
- value=os.getenv("HF_TOKEN", ""),
123
- info="Get your token from: https://huggingface.co/settings/tokens"
124
- )
125
- hf_status = gr.Markdown("Status: Using environment variable" if os.getenv("HF_TOKEN") else "⚠️ Status: No token configured")
126
-
127
- with gr.Row():
128
- save_keys_button = gr.Button("💾 Save API Keys for This Session", variant="primary", size="lg")
129
- clear_keys_button = gr.Button("🗑️ Clear Session Keys", variant="secondary")
130
-
131
- keys_save_status = gr.Markdown("")
132
-
133
- def save_api_keys(gemini_key, hf_token):
134
- """
135
- Save API keys to session state.
136
-
137
- Args:
138
- gemini_key (str): Google Gemini API key
139
- hf_token (str): HuggingFace token
140
-
141
- Returns:
142
- tuple: Updated state values and status message
143
- """
144
- status_messages = []
145
-
146
- # Validate and save Gemini key
147
- if gemini_key and gemini_key.strip():
148
- try:
149
- # Test the key by creating a client
150
- test_client = GeminiClient(api_key=gemini_key.strip())
151
- gemini_saved = gemini_key.strip()
152
- status_messages.append("✅ Gemini API key validated and saved")
153
- except Exception as e:
154
- gemini_saved = os.getenv("GEMINI_API_KEY", "")
155
- status_messages.append(f"❌ Gemini API key invalid: {str(e)}")
156
- else:
157
- gemini_saved = os.getenv("GEMINI_API_KEY", "")
158
- status_messages.append("ℹ️ Gemini API key cleared (using environment variable if set)")
159
-
160
- # Validate and save HF token
161
- if hf_token and hf_token.strip():
162
- hf_saved = hf_token.strip()
163
- status_messages.append("✅ HuggingFace token saved")
164
- else:
165
- hf_saved = os.getenv("HF_TOKEN", "")
166
- status_messages.append("ℹ️ HuggingFace token cleared (using environment variable if set)")
167
-
168
- status_markdown = "\n\n".join(status_messages)
169
-
170
- return gemini_saved, hf_saved, f"### Save Status\n\n{status_markdown}"
171
-
172
- def clear_api_keys():
173
- """
174
- Clear session API keys and revert to environment variables.
175
-
176
- Returns:
177
- tuple: Cleared state values and status message
178
- """
179
- env_gemini = os.getenv("GEMINI_API_KEY", "")
180
- env_hf = os.getenv("HF_TOKEN", "")
181
-
182
- status = "### Keys Cleared\n\nReverted to environment variables.\n\n"
183
- if env_gemini:
184
- status += "✅ Using GEMINI_API_KEY from environment\n\n"
185
- else:
186
- status += "⚠️ No GEMINI_API_KEY in environment\n\n"
187
-
188
- if env_hf:
189
- status += "✅ Using HF_TOKEN from environment"
190
- else:
191
- status += "⚠️ No HF_TOKEN in environment"
192
-
193
- return env_gemini, env_hf, status
194
-
195
- save_keys_button.click(
196
- fn=save_api_keys,
197
- inputs=[gemini_key_input, hf_token_input],
198
- outputs=[gemini_key_state, hf_token_state, keys_save_status]
199
- )
200
-
201
- clear_keys_button.click(
202
- fn=clear_api_keys,
203
- inputs=[],
204
- outputs=[gemini_key_state, hf_token_state, keys_save_status]
205
- )
206
-
207
- gr.Markdown("""
208
- ---
209
-
210
- ### How It Works
211
-
212
- 1. **Enter your API keys** in the fields above
213
- 2. **Click "Save API Keys"** to validate and store them for this session
214
- 3. **Use any tool** - they will automatically use your configured keys
215
- 4. **Keys are session-only** - they won't be saved when you close the browser
216
-
217
- ### Environment Variables (Alternative)
218
-
219
- You can also set these as environment variables:
220
- ```bash
221
- export GEMINI_API_KEY="your-key-here"
222
- export HF_TOKEN="your-token-here"
223
- ```
224
-
225
- UI-configured keys will always override environment variables.
226
- """)
227
-
228
  # Tab 1: Analyze Leaderboard
229
  with gr.Tab("📊 Analyze Leaderboard"):
230
  gr.Markdown("### Get AI-powered insights from evaluation leaderboard")
@@ -258,7 +114,7 @@ def create_gradio_ui():
258
  with gr.Column():
259
  lb_output = gr.Markdown(label="Analysis Results")
260
 
261
- async def run_analyze_leaderboard(repo, metric, time_range, top_n, gemini_key, hf_token):
262
  """
263
  Analyze agent evaluation leaderboard and generate AI-powered insights.
264
 
@@ -278,16 +134,11 @@ def create_gradio_ui():
278
  str: Markdown-formatted analysis with top performers, trends, and recommendations
279
  """
280
  try:
281
- # Use user-provided key or fall back to environment variable
282
- api_key = gemini_key if gemini_key and gemini_key.strip() else None
283
-
284
  result = await analyze_leaderboard(
285
  leaderboard_repo=repo,
286
  metric_focus=metric,
287
  time_range=time_range,
288
- top_n=int(top_n),
289
- hf_token=hf_token if hf_token and hf_token.strip() else None,
290
- gemini_api_key=api_key
291
  )
292
  return result
293
  except Exception as e:
@@ -295,7 +146,7 @@ def create_gradio_ui():
295
 
296
  lb_button.click(
297
  fn=run_analyze_leaderboard,
298
- inputs=[lb_repo, lb_metric, lb_time, lb_top_n, gemini_key_state, hf_token_state],
299
  outputs=[lb_output]
300
  )
301
 
@@ -325,7 +176,7 @@ def create_gradio_ui():
325
  with gr.Column():
326
  trace_output = gr.Markdown(label="Debug Analysis")
327
 
328
- async def run_debug_trace(trace_id_val, traces_repo_val, question_val, gemini_key, hf_token):
329
  """
330
  Debug a specific agent execution trace using OpenTelemetry data.
331
 
@@ -347,23 +198,17 @@ def create_gradio_ui():
347
  if not trace_id_val or not traces_repo_val:
348
  return "❌ **Error**: Please provide both Trace ID and Traces Repository"
349
 
350
- # Use user-provided key or fall back to environment variable
351
- api_key = gemini_key if gemini_key and gemini_key.strip() else None
352
-
353
  result = await debug_trace(
354
  trace_id=trace_id_val,
355
  traces_repo=traces_repo_val,
356
- question=question_val or "Analyze this trace",
357
- hf_token=hf_token if hf_token and hf_token.strip() else None,
358
- gemini_api_key=api_key
359
- )
360
  return result
361
  except Exception as e:
362
  return f"❌ **Error**: {str(e)}"
363
 
364
  trace_button.click(
365
  fn=run_debug_trace,
366
- inputs=[trace_id, traces_repo, question, gemini_key_state, hf_token_state],
367
  outputs=[trace_output]
368
  )
369
 
@@ -401,7 +246,7 @@ def create_gradio_ui():
401
  with gr.Column():
402
  cost_output = gr.Markdown(label="Cost Estimate")
403
 
404
- async def run_estimate_cost(model, agent_type, num_tests, hardware, gemini_key):
405
  """
406
  Estimate the cost, duration, and CO2 emissions of running agent evaluations.
407
 
@@ -423,15 +268,11 @@ def create_gradio_ui():
423
  if not model:
424
  return "❌ **Error**: Please provide a model name"
425
 
426
- # Use user-provided key or fall back to environment variable
427
- api_key = gemini_key if gemini_key and gemini_key.strip() else None
428
-
429
  result = await estimate_cost(
430
  model=model,
431
  agent_type=agent_type,
432
  num_tests=int(num_tests),
433
- hardware=hardware,
434
- gemini_api_key=api_key
435
  )
436
  return result
437
  except Exception as e:
@@ -439,7 +280,7 @@ def create_gradio_ui():
439
 
440
  cost_button.click(
441
  fn=run_estimate_cost,
442
- inputs=[cost_model, cost_agent_type, cost_num_tests, cost_hardware, gemini_key_state],
443
  outputs=[cost_output]
444
  )
445
 
@@ -482,7 +323,7 @@ def create_gradio_ui():
482
  compare_button = gr.Button("🔍 Compare Runs", variant="primary")
483
  compare_output = gr.Markdown()
484
 
485
- async def run_compare_runs(run_id_1, run_id_2, focus, repo, gemini_key, hf_token):
486
  """
487
  Compare two evaluation runs and generate AI-powered comparative analysis.
488
 
@@ -502,16 +343,11 @@ def create_gradio_ui():
502
  str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations
503
  """
504
  try:
505
- # Use user-provided key or fall back to environment variable
506
- api_key = gemini_key if gemini_key and gemini_key.strip() else None
507
-
508
  result = await compare_runs(
509
  run_id_1=run_id_1,
510
  run_id_2=run_id_2,
511
  leaderboard_repo=repo,
512
- comparison_focus=focus,
513
- hf_token=hf_token if hf_token and hf_token.strip() else None,
514
- gemini_api_key=api_key
515
  )
516
  return result
517
  except Exception as e:
@@ -519,7 +355,7 @@ def create_gradio_ui():
519
 
520
  compare_button.click(
521
  fn=run_compare_runs,
522
- inputs=[compare_run_id_1, compare_run_id_2, compare_focus, compare_repo, gemini_key_state, hf_token_state],
523
  outputs=[compare_output]
524
  )
525
 
@@ -558,7 +394,7 @@ def create_gradio_ui():
558
  results_button = gr.Button("🔍 Analyze Results", variant="primary")
559
  results_output = gr.Markdown()
560
 
561
- async def run_analyze_results(repo, focus, max_rows, gemini_key, hf_token):
562
  """
563
  Analyze detailed test results and provide optimization recommendations.
564
 
@@ -576,15 +412,10 @@ def create_gradio_ui():
576
  if not repo:
577
  return "❌ **Error**: Please provide a results repository"
578
 
579
- # Use user-provided key or fall back to environment variable
580
- api_key = gemini_key if gemini_key and gemini_key.strip() else None
581
-
582
  result = await analyze_results(
583
  results_repo=repo,
584
  analysis_focus=focus,
585
- max_rows=int(max_rows),
586
- hf_token=hf_token if hf_token and hf_token.strip() else None,
587
- gemini_api_key=api_key
588
  )
589
  return result
590
  except Exception as e:
@@ -592,7 +423,7 @@ def create_gradio_ui():
592
 
593
  results_button.click(
594
  fn=run_analyze_results,
595
- inputs=[results_repo_input, results_focus, results_max_rows, gemini_key_state, hf_token_state],
596
  outputs=[results_output]
597
  )
598
 
@@ -629,7 +460,7 @@ def create_gradio_ui():
629
  dataset_button = gr.Button("📥 Load Dataset", variant="primary")
630
  dataset_output = gr.JSON(label="Dataset JSON Output")
631
 
632
- async def run_get_dataset(repo, max_rows, hf_token):
633
  """
634
  Load SMOLTRACE datasets from HuggingFace and return as JSON.
635
 
@@ -649,8 +480,7 @@ def create_gradio_ui():
649
  import json
650
  result = await get_dataset(
651
  dataset_repo=repo,
652
- max_rows=int(max_rows),
653
- hf_token=hf_token if hf_token and hf_token.strip() else None
654
  )
655
  # Parse JSON string back to dict for JSON component
656
  return json.loads(result)
@@ -659,7 +489,7 @@ def create_gradio_ui():
659
 
660
  dataset_button.click(
661
  fn=run_get_dataset,
662
- inputs=[dataset_repo_input, dataset_max_rows, hf_token_state],
663
  outputs=[dataset_output]
664
  )
665
 
 
80
 
81
  **MCP Endpoint (SSE - Deprecated)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse`
82
  """)
 
 
 
 
 
83
  with gr.Tabs():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # Tab 1: Analyze Leaderboard
85
  with gr.Tab("📊 Analyze Leaderboard"):
86
  gr.Markdown("### Get AI-powered insights from evaluation leaderboard")
 
114
  with gr.Column():
115
  lb_output = gr.Markdown(label="Analysis Results")
116
 
117
+ async def run_analyze_leaderboard(repo, metric, time_range, top_n):
118
  """
119
  Analyze agent evaluation leaderboard and generate AI-powered insights.
120
 
 
134
  str: Markdown-formatted analysis with top performers, trends, and recommendations
135
  """
136
  try:
 
 
 
137
  result = await analyze_leaderboard(
138
  leaderboard_repo=repo,
139
  metric_focus=metric,
140
  time_range=time_range,
141
+ top_n=int(top_n)
 
 
142
  )
143
  return result
144
  except Exception as e:
 
146
 
147
  lb_button.click(
148
  fn=run_analyze_leaderboard,
149
+ inputs=[lb_repo, lb_metric, lb_time, lb_top_n],
150
  outputs=[lb_output]
151
  )
152
 
 
176
  with gr.Column():
177
  trace_output = gr.Markdown(label="Debug Analysis")
178
 
179
+ async def run_debug_trace(trace_id_val, traces_repo_val, question_val):
180
  """
181
  Debug a specific agent execution trace using OpenTelemetry data.
182
 
 
198
  if not trace_id_val or not traces_repo_val:
199
  return "❌ **Error**: Please provide both Trace ID and Traces Repository"
200
 
 
 
 
201
  result = await debug_trace(
202
  trace_id=trace_id_val,
203
  traces_repo=traces_repo_val,
204
+ question=question_val or "Analyze this trace")
 
 
 
205
  return result
206
  except Exception as e:
207
  return f"❌ **Error**: {str(e)}"
208
 
209
  trace_button.click(
210
  fn=run_debug_trace,
211
+ inputs=[trace_id, traces_repo, question],
212
  outputs=[trace_output]
213
  )
214
 
 
246
  with gr.Column():
247
  cost_output = gr.Markdown(label="Cost Estimate")
248
 
249
+ async def run_estimate_cost(model, agent_type, num_tests, hardware):
250
  """
251
  Estimate the cost, duration, and CO2 emissions of running agent evaluations.
252
 
 
268
  if not model:
269
  return "❌ **Error**: Please provide a model name"
270
 
 
 
 
271
  result = await estimate_cost(
272
  model=model,
273
  agent_type=agent_type,
274
  num_tests=int(num_tests),
275
+ hardware=hardware
 
276
  )
277
  return result
278
  except Exception as e:
 
280
 
281
  cost_button.click(
282
  fn=run_estimate_cost,
283
+ inputs=[cost_model, cost_agent_type, cost_num_tests, cost_hardware],
284
  outputs=[cost_output]
285
  )
286
 
 
323
  compare_button = gr.Button("🔍 Compare Runs", variant="primary")
324
  compare_output = gr.Markdown()
325
 
326
+ async def run_compare_runs(run_id_1, run_id_2, focus, repo):
327
  """
328
  Compare two evaluation runs and generate AI-powered comparative analysis.
329
 
 
343
  str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations
344
  """
345
  try:
 
 
 
346
  result = await compare_runs(
347
  run_id_1=run_id_1,
348
  run_id_2=run_id_2,
349
  leaderboard_repo=repo,
350
+ comparison_focus=focus
 
 
351
  )
352
  return result
353
  except Exception as e:
 
355
 
356
  compare_button.click(
357
  fn=run_compare_runs,
358
+ inputs=[compare_run_id_1, compare_run_id_2, compare_focus, compare_repo],
359
  outputs=[compare_output]
360
  )
361
 
 
394
  results_button = gr.Button("🔍 Analyze Results", variant="primary")
395
  results_output = gr.Markdown()
396
 
397
+ async def run_analyze_results(repo, focus, max_rows):
398
  """
399
  Analyze detailed test results and provide optimization recommendations.
400
 
 
412
  if not repo:
413
  return "❌ **Error**: Please provide a results repository"
414
 
 
 
 
415
  result = await analyze_results(
416
  results_repo=repo,
417
  analysis_focus=focus,
418
+ max_rows=int(max_rows)
 
 
419
  )
420
  return result
421
  except Exception as e:
 
423
 
424
  results_button.click(
425
  fn=run_analyze_results,
426
+ inputs=[results_repo_input, results_focus, results_max_rows],
427
  outputs=[results_output]
428
  )
429
 
 
460
  dataset_button = gr.Button("📥 Load Dataset", variant="primary")
461
  dataset_output = gr.JSON(label="Dataset JSON Output")
462
 
463
+ async def run_get_dataset(repo, max_rows):
464
  """
465
  Load SMOLTRACE datasets from HuggingFace and return as JSON.
466
 
 
480
  import json
481
  result = await get_dataset(
482
  dataset_repo=repo,
483
+ max_rows=int(max_rows)
 
484
  )
485
  # Parse JSON string back to dict for JSON component
486
  return json.loads(result)
 
489
 
490
  dataset_button.click(
491
  fn=run_get_dataset,
492
+ inputs=[dataset_repo_input, dataset_max_rows],
493
  outputs=[dataset_output]
494
  )
495
 
mcp_tools.py CHANGED
@@ -27,9 +27,7 @@ async def analyze_leaderboard(
27
  leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
28
  metric_focus: str = "overall",
29
  time_range: str = "last_week",
30
- top_n: int = 5,
31
- hf_token: Optional[str] = None,
32
- gemini_api_key: Optional[str] = None
33
  ) -> str:
34
  """
35
  Answer questions about the leaderboard with AI-powered analysis and insights.
@@ -47,26 +45,25 @@ async def analyze_leaderboard(
47
  agent evaluation results, including top performers, trends, cost/performance
48
  trade-offs, and actionable recommendations.
49
 
 
 
 
50
  Args:
51
  leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
52
  metric_focus (str): Primary metric to focus analysis on. Options: "overall", "accuracy", "cost", "latency", "co2". Default: "overall"
53
  time_range (str): Time range for analysis. Options: "last_week", "last_month", "all_time". Default: "last_week"
54
  top_n (int): Number of top models to highlight in analysis. Must be between 3 and 10. Default: 5
55
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
56
- gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
57
 
58
  Returns:
59
  str: Markdown-formatted analysis with top performers, insights, trade-offs, and recommendations
60
  """
61
  try:
62
- # Initialize Gemini client with provided key or from environment
63
- gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
64
- # Load leaderboard data from HuggingFace
65
  print(f"Loading leaderboard from {leaderboard_repo}...")
66
 
67
- # Use user-provided token or fall back to environment variable
68
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
69
- ds = load_dataset(leaderboard_repo, split="train", token=token)
70
  df = pd.DataFrame(ds)
71
 
72
  # Filter by time range
@@ -135,9 +132,7 @@ async def analyze_leaderboard(
135
  async def debug_trace(
136
  trace_id: str,
137
  traces_repo: str,
138
- question: str = "Analyze this trace and explain what happened",
139
- hf_token: Optional[str] = None,
140
- gemini_api_key: Optional[str] = None
141
  ) -> str:
142
  """
143
  Answer questions about agent traces with AI-powered debugging and analysis.
@@ -159,21 +154,16 @@ async def debug_trace(
159
  trace_id (str): Unique identifier for the trace to analyze (e.g., "trace_abc123")
160
  traces_repo (str): HuggingFace dataset repository containing trace data (e.g., "username/agent-traces-model-timestamp")
161
  question (str): Specific question about the trace. Default: "Analyze this trace and explain what happened"
162
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
163
- gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
164
-
165
  Returns:
166
  str: Markdown-formatted debug analysis with step-by-step breakdown, timing information, and answer to the question
167
  """
168
  try:
169
  # Initialize Gemini client with provided key or from environment
170
- gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
171
- # Load traces dataset
172
  print(f"Loading traces from {traces_repo}...")
173
 
174
- # Use user-provided token or fall back to environment variable
175
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
176
- ds = load_dataset(traces_repo, split="train", token=token)
177
  df = pd.DataFrame(ds)
178
 
179
  # Find the specific trace
@@ -243,8 +233,7 @@ async def estimate_cost(
243
  model: str,
244
  agent_type: str,
245
  num_tests: int = 100,
246
- hardware: str = "auto",
247
- gemini_api_key: Optional[str] = None
248
  ) -> str:
249
  """
250
  Answer questions about evaluation costs with AI-powered estimates and recommendations.
@@ -267,14 +256,12 @@ async def estimate_cost(
267
  agent_type (str): Type of agent capabilities to test. Options: "tool", "code", "both"
268
  num_tests (int): Number of test cases to run. Must be between 10 and 1000. Default: 100
269
  hardware (str): Hardware type for HuggingFace Jobs. Options: "auto", "cpu", "gpu_a10", "gpu_h200". Default: "auto"
270
- gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
271
-
272
  Returns:
273
  str: Markdown-formatted cost estimate with breakdown of LLM costs, HF Jobs costs, duration, CO2 emissions, and optimization tips
274
  """
275
  try:
276
  # Initialize Gemini client with provided key or from environment
277
- gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
278
  # Determine if API or local model
279
  is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])
280
 
@@ -378,9 +365,7 @@ async def compare_runs(
378
  run_id_1: str,
379
  run_id_2: str,
380
  leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
381
- comparison_focus: str = "comprehensive",
382
- hf_token: Optional[str] = None,
383
- gemini_api_key: Optional[str] = None
384
  ) -> str:
385
  """
386
  Compare two evaluation runs and generate AI-powered comparative analysis.
@@ -394,19 +379,13 @@ async def compare_runs(
394
  run_id_2 (str): Second run ID to compare
395
  leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
396
  comparison_focus (str): Focus area for comparison. Options: "comprehensive", "cost", "performance", "eco_friendly". Default: "comprehensive"
397
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
398
- gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
399
-
400
  Returns:
401
  str: Markdown-formatted comparative analysis with winner for each category, trade-offs, and use case recommendations
402
  """
403
  try:
404
  # Initialize Gemini client with provided key or from environment
405
- gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
406
- # Load leaderboard data
407
- # Use user-provided token or fall back to environment variable
408
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
409
- dataset = load_dataset(leaderboard_repo, split="train", token=token)
410
  df = pd.DataFrame(dataset)
411
 
412
  # Find the two runs
@@ -580,9 +559,7 @@ Provide eco-conscious recommendations for sustainable AI deployment.
580
  async def analyze_results(
581
  results_repo: str,
582
  analysis_focus: str = "comprehensive",
583
- max_rows: int = 100,
584
- hf_token: Optional[str] = None,
585
- gemini_api_key: Optional[str] = None
586
  ) -> str:
587
  """
588
  Analyze detailed test results and provide optimization recommendations.
@@ -601,20 +578,17 @@ async def analyze_results(
601
  results_repo (str): HuggingFace dataset repository containing results (e.g., "username/smoltrace-results-gpt4-20251114")
602
  analysis_focus (str): Focus area. Options: "failures", "performance", "cost", "comprehensive". Default: "comprehensive"
603
  max_rows (int): Maximum test cases to analyze. Default: 100. Range: 10-500
604
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
605
- gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
606
-
607
  Returns:
608
  str: Markdown-formatted analysis with failure patterns, performance insights, cost analysis, and optimization recommendations
609
  """
610
  try:
611
  # Initialize Gemini client
612
- gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
613
 
614
  # Load results dataset
615
  print(f"Loading results from {results_repo}...")
616
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
617
- ds = load_dataset(results_repo, split="train", token=token)
618
  df = pd.DataFrame(ds)
619
 
620
  if df.empty:
@@ -727,8 +701,7 @@ async def analyze_results(
727
  @gr.mcp.tool()
728
  async def get_dataset(
729
  dataset_repo: str,
730
- max_rows: int = 50,
731
- hf_token: Optional[str] = None
732
  ) -> str:
733
  """
734
  Load SMOLTRACE datasets from HuggingFace and return as JSON.
@@ -748,8 +721,6 @@ async def get_dataset(
748
  Args:
749
  dataset_repo (str): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard")
750
  max_rows (int): Maximum number of rows to return. Default: 50. Range: 1-200
751
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
752
-
753
  Returns:
754
  str: JSON object with dataset data and metadata
755
  """
@@ -762,10 +733,7 @@ async def get_dataset(
762
  "data": []
763
  }, indent=2, default=str)
764
 
765
- # Load dataset from HuggingFace
766
- # Use user-provided token or fall back to environment variable
767
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
768
- dataset = load_dataset(dataset_repo, split="train", token=token)
769
  df = pd.DataFrame(dataset)
770
 
771
  if df.empty:
@@ -815,7 +783,7 @@ async def get_dataset(
815
  # ============================================================================
816
 
817
  @gr.mcp.resource("leaderboard://{repo}")
818
- def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard", hf_token: Optional[str] = None) -> str:
819
  """
820
  [RAW DATA ONLY] Get raw leaderboard data in JSON format - NO analysis or insights.
821
 
@@ -829,17 +797,15 @@ def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard", hf_
829
 
830
  For questions, insights, recommendations, or analysis → use analyze_leaderboard tool instead!
831
 
 
 
832
  Args:
833
  repo (str): HuggingFace dataset repository name. Default: "kshitijthakkar/smoltrace-leaderboard"
834
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
835
-
836
  Returns:
837
  str: Raw JSON string containing all evaluation runs without any analysis
838
  """
839
- try:
840
- # Use user-provided token or fall back to environment variable
841
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
842
- ds = load_dataset(repo, split="train", token=token)
843
  df = pd.DataFrame(ds)
844
 
845
  # Convert to JSON with proper formatting
@@ -858,7 +824,7 @@ def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard", hf_
858
 
859
 
860
  @gr.mcp.resource("trace://{trace_id}/{repo}")
861
- def get_trace_data(trace_id: str, repo: str, hf_token: Optional[str] = None) -> str:
862
  """
863
  [RAW DATA ONLY] Get raw OpenTelemetry trace data in JSON format - NO analysis.
864
 
@@ -872,18 +838,16 @@ def get_trace_data(trace_id: str, repo: str, hf_token: Optional[str] = None) ->
872
 
873
  For debugging, questions, or analysis → use debug_trace tool instead!
874
 
 
 
875
  Args:
876
  trace_id (str): Unique identifier for the trace (e.g., "trace_abc123")
877
  repo (str): HuggingFace dataset repository containing traces (e.g., "username/agent-traces-model")
878
- hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
879
-
880
  Returns:
881
  str: Raw JSON string containing OpenTelemetry spans without any analysis
882
  """
883
- try:
884
- # Use user-provided token or fall back to environment variable
885
- token = hf_token if hf_token else os.getenv("HF_TOKEN")
886
- ds = load_dataset(repo, split="train", token=token)
887
  df = pd.DataFrame(ds)
888
 
889
  # Find specific trace
 
27
  leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
28
  metric_focus: str = "overall",
29
  time_range: str = "last_week",
30
+ top_n: int = 5
 
 
31
  ) -> str:
32
  """
33
  Answer questions about the leaderboard with AI-powered analysis and insights.
 
45
  agent evaluation results, including top performers, trends, cost/performance
46
  trade-offs, and actionable recommendations.
47
 
48
+ **Security**: Requires GEMINI_API_KEY environment variable.
49
+ **Note**: All SMOLTRACE datasets are public - no HF token required.
50
+
51
  Args:
52
  leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
53
  metric_focus (str): Primary metric to focus analysis on. Options: "overall", "accuracy", "cost", "latency", "co2". Default: "overall"
54
  time_range (str): Time range for analysis. Options: "last_week", "last_month", "all_time". Default: "last_week"
55
  top_n (int): Number of top models to highlight in analysis. Must be between 3 and 10. Default: 5
 
 
56
 
57
  Returns:
58
  str: Markdown-formatted analysis with top performers, insights, trade-offs, and recommendations
59
  """
60
  try:
61
+ # Initialize Gemini client from environment variable only
62
+ gemini_client = GeminiClient()
63
+ # Load leaderboard data from HuggingFace (public dataset)
64
  print(f"Loading leaderboard from {leaderboard_repo}...")
65
 
66
+ ds = load_dataset(leaderboard_repo, split="train")
 
 
67
  df = pd.DataFrame(ds)
68
 
69
  # Filter by time range
 
132
  async def debug_trace(
133
  trace_id: str,
134
  traces_repo: str,
135
+ question: str = "Analyze this trace and explain what happened"
 
 
136
  ) -> str:
137
  """
138
  Answer questions about agent traces with AI-powered debugging and analysis.
 
154
  trace_id (str): Unique identifier for the trace to analyze (e.g., "trace_abc123")
155
  traces_repo (str): HuggingFace dataset repository containing trace data (e.g., "username/agent-traces-model-timestamp")
156
  question (str): Specific question about the trace. Default: "Analyze this trace and explain what happened"
 
 
 
157
  Returns:
158
  str: Markdown-formatted debug analysis with step-by-step breakdown, timing information, and answer to the question
159
  """
160
  try:
161
  # Initialize Gemini client with provided key or from environment
162
+ gemini_client = GeminiClient()
163
+ # Load traces dataset (public dataset)
164
  print(f"Loading traces from {traces_repo}...")
165
 
166
+ ds = load_dataset(traces_repo, split="train")
 
 
167
  df = pd.DataFrame(ds)
168
 
169
  # Find the specific trace
 
233
  model: str,
234
  agent_type: str,
235
  num_tests: int = 100,
236
+ hardware: str = "auto"
 
237
  ) -> str:
238
  """
239
  Answer questions about evaluation costs with AI-powered estimates and recommendations.
 
256
  agent_type (str): Type of agent capabilities to test. Options: "tool", "code", "both"
257
  num_tests (int): Number of test cases to run. Must be between 10 and 1000. Default: 100
258
  hardware (str): Hardware type for HuggingFace Jobs. Options: "auto", "cpu", "gpu_a10", "gpu_h200". Default: "auto"
 
 
259
  Returns:
260
  str: Markdown-formatted cost estimate with breakdown of LLM costs, HF Jobs costs, duration, CO2 emissions, and optimization tips
261
  """
262
  try:
263
  # Initialize Gemini client with provided key or from environment
264
+ gemini_client = GeminiClient()
265
  # Determine if API or local model
266
  is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])
267
 
 
365
  run_id_1: str,
366
  run_id_2: str,
367
  leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
368
+ comparison_focus: str = "comprehensive"
 
 
369
  ) -> str:
370
  """
371
  Compare two evaluation runs and generate AI-powered comparative analysis.
 
379
  run_id_2 (str): Second run ID to compare
380
  leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
381
  comparison_focus (str): Focus area for comparison. Options: "comprehensive", "cost", "performance", "eco_friendly". Default: "comprehensive"
 
 
 
382
  Returns:
383
  str: Markdown-formatted comparative analysis with winner for each category, trade-offs, and use case recommendations
384
  """
385
  try:
386
  # Initialize Gemini client with provided key or from environment
387
+ gemini_client = GeminiClient()
388
+ # Load leaderboard data dataset = load_dataset(leaderboard_repo, split="train")
 
 
 
389
  df = pd.DataFrame(dataset)
390
 
391
  # Find the two runs
 
559
  async def analyze_results(
560
  results_repo: str,
561
  analysis_focus: str = "comprehensive",
562
+ max_rows: int = 100
 
 
563
  ) -> str:
564
  """
565
  Analyze detailed test results and provide optimization recommendations.
 
578
  results_repo (str): HuggingFace dataset repository containing results (e.g., "username/smoltrace-results-gpt4-20251114")
579
  analysis_focus (str): Focus area. Options: "failures", "performance", "cost", "comprehensive". Default: "comprehensive"
580
  max_rows (int): Maximum test cases to analyze. Default: 100. Range: 10-500
 
 
 
581
  Returns:
582
  str: Markdown-formatted analysis with failure patterns, performance insights, cost analysis, and optimization recommendations
583
  """
584
  try:
585
  # Initialize Gemini client
586
+ gemini_client = GeminiClient()
587
 
588
  # Load results dataset
589
  print(f"Loading results from {results_repo}...")
590
+
591
+ ds = load_dataset(results_repo, split="train")
592
  df = pd.DataFrame(ds)
593
 
594
  if df.empty:
 
701
  @gr.mcp.tool()
702
  async def get_dataset(
703
  dataset_repo: str,
704
+ max_rows: int = 50
 
705
  ) -> str:
706
  """
707
  Load SMOLTRACE datasets from HuggingFace and return as JSON.
 
721
  Args:
722
  dataset_repo (str): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard")
723
  max_rows (int): Maximum number of rows to return. Default: 50. Range: 1-200
 
 
724
  Returns:
725
  str: JSON object with dataset data and metadata
726
  """
 
733
  "data": []
734
  }, indent=2, default=str)
735
 
736
+ # Load dataset from HuggingFace dataset = load_dataset(dataset_repo, split="train")
 
 
 
737
  df = pd.DataFrame(dataset)
738
 
739
  if df.empty:
 
783
  # ============================================================================
784
 
785
  @gr.mcp.resource("leaderboard://{repo}")
786
+ def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard") -> str:
787
  """
788
  [RAW DATA ONLY] Get raw leaderboard data in JSON format - NO analysis or insights.
789
 
 
797
 
798
  For questions, insights, recommendations, or analysis → use analyze_leaderboard tool instead!
799
 
800
+ **Note**: All SMOLTRACE datasets are public - no authentication required.
801
+
802
  Args:
803
  repo (str): HuggingFace dataset repository name. Default: "kshitijthakkar/smoltrace-leaderboard"
 
 
804
  Returns:
805
  str: Raw JSON string containing all evaluation runs without any analysis
806
  """
807
+ try:
808
+ ds = load_dataset(repo, split="train")
 
 
809
  df = pd.DataFrame(ds)
810
 
811
  # Convert to JSON with proper formatting
 
824
 
825
 
826
  @gr.mcp.resource("trace://{trace_id}/{repo}")
827
+ def get_trace_data(trace_id: str, repo: str) -> str:
828
  """
829
  [RAW DATA ONLY] Get raw OpenTelemetry trace data in JSON format - NO analysis.
830
 
 
838
 
839
  For debugging, questions, or analysis → use debug_trace tool instead!
840
 
841
+ **Note**: All SMOLTRACE datasets are public - no authentication required.
842
+
843
  Args:
844
  trace_id (str): Unique identifier for the trace (e.g., "trace_abc123")
845
  repo (str): HuggingFace dataset repository containing traces (e.g., "username/agent-traces-model")
 
 
846
  Returns:
847
  str: Raw JSON string containing OpenTelemetry spans without any analysis
848
  """
849
+ try:
850
+ ds = load_dataset(repo, split="train")
 
 
851
  df = pd.DataFrame(ds)
852
 
853
  # Find specific trace