Mandark-droid
committed on
Commit
·
fbd2ae8
1
Parent(s):
0c0a9f1
fix: Remove all API key parameters from MCP tools - use environment variables only
Browse files
- Removed hf_token and gemini_api_key parameters from all MCP tool functions
- Removed Settings tab from UI (no more API key inputs)
- All SMOLTRACE datasets are public - no HF token required
- Only GEMINI_API_KEY environment variable needed
- Fixes security vulnerability where API keys were exposed in MCP examples
Security Impact: ZERO risk of API key exposure in MCP examples
- app.py +18 -188
- mcp_tools.py +33 -69
app.py
CHANGED
|
@@ -80,151 +80,7 @@ def create_gradio_ui():
|
|
| 80 |
|
| 81 |
**MCP Endpoint (SSE - Deprecated)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse`
|
| 82 |
""")
|
| 83 |
-
|
| 84 |
-
# Session state for API keys
|
| 85 |
-
gemini_key_state = gr.State(value=os.getenv("GEMINI_API_KEY", ""))
|
| 86 |
-
hf_token_state = gr.State(value=os.getenv("HF_TOKEN", ""))
|
| 87 |
-
|
| 88 |
with gr.Tabs():
|
| 89 |
-
# Tab 0: Settings (API Keys)
|
| 90 |
-
with gr.Tab("⚙️ Settings"):
|
| 91 |
-
gr.Markdown("""
|
| 92 |
-
## 🔑 API Key Configuration
|
| 93 |
-
|
| 94 |
-
Configure your API keys here. These will override environment variables for this session only.
|
| 95 |
-
|
| 96 |
-
**Why configure here?**
|
| 97 |
-
- No need to set environment variables
|
| 98 |
-
- Test with different API keys easily
|
| 99 |
-
- Secure session-only storage (not persisted)
|
| 100 |
-
|
| 101 |
-
**Security Note**: API keys are stored in session state only and are not saved permanently.
|
| 102 |
-
""")
|
| 103 |
-
|
| 104 |
-
with gr.Row():
|
| 105 |
-
with gr.Column():
|
| 106 |
-
gr.Markdown("### Google Gemini API Key")
|
| 107 |
-
gemini_key_input = gr.Textbox(
|
| 108 |
-
label="Gemini API Key",
|
| 109 |
-
placeholder="Enter your Google Gemini API key",
|
| 110 |
-
type="password",
|
| 111 |
-
value=os.getenv("GEMINI_API_KEY", ""),
|
| 112 |
-
info="Get your key from: https://aistudio.google.com/app/apikey"
|
| 113 |
-
)
|
| 114 |
-
gemini_status = gr.Markdown("Status: Using environment variable" if os.getenv("GEMINI_API_KEY") else "⚠️ Status: No API key configured")
|
| 115 |
-
|
| 116 |
-
with gr.Column():
|
| 117 |
-
gr.Markdown("### HuggingFace Token")
|
| 118 |
-
hf_token_input = gr.Textbox(
|
| 119 |
-
label="HuggingFace Token",
|
| 120 |
-
placeholder="Enter your HuggingFace token",
|
| 121 |
-
type="password",
|
| 122 |
-
value=os.getenv("HF_TOKEN", ""),
|
| 123 |
-
info="Get your token from: https://huggingface.co/settings/tokens"
|
| 124 |
-
)
|
| 125 |
-
hf_status = gr.Markdown("Status: Using environment variable" if os.getenv("HF_TOKEN") else "⚠️ Status: No token configured")
|
| 126 |
-
|
| 127 |
-
with gr.Row():
|
| 128 |
-
save_keys_button = gr.Button("💾 Save API Keys for This Session", variant="primary", size="lg")
|
| 129 |
-
clear_keys_button = gr.Button("🗑️ Clear Session Keys", variant="secondary")
|
| 130 |
-
|
| 131 |
-
keys_save_status = gr.Markdown("")
|
| 132 |
-
|
| 133 |
-
def save_api_keys(gemini_key, hf_token):
|
| 134 |
-
"""
|
| 135 |
-
Save API keys to session state.
|
| 136 |
-
|
| 137 |
-
Args:
|
| 138 |
-
gemini_key (str): Google Gemini API key
|
| 139 |
-
hf_token (str): HuggingFace token
|
| 140 |
-
|
| 141 |
-
Returns:
|
| 142 |
-
tuple: Updated state values and status message
|
| 143 |
-
"""
|
| 144 |
-
status_messages = []
|
| 145 |
-
|
| 146 |
-
# Validate and save Gemini key
|
| 147 |
-
if gemini_key and gemini_key.strip():
|
| 148 |
-
try:
|
| 149 |
-
# Test the key by creating a client
|
| 150 |
-
test_client = GeminiClient(api_key=gemini_key.strip())
|
| 151 |
-
gemini_saved = gemini_key.strip()
|
| 152 |
-
status_messages.append("✅ Gemini API key validated and saved")
|
| 153 |
-
except Exception as e:
|
| 154 |
-
gemini_saved = os.getenv("GEMINI_API_KEY", "")
|
| 155 |
-
status_messages.append(f"❌ Gemini API key invalid: {str(e)}")
|
| 156 |
-
else:
|
| 157 |
-
gemini_saved = os.getenv("GEMINI_API_KEY", "")
|
| 158 |
-
status_messages.append("ℹ️ Gemini API key cleared (using environment variable if set)")
|
| 159 |
-
|
| 160 |
-
# Validate and save HF token
|
| 161 |
-
if hf_token and hf_token.strip():
|
| 162 |
-
hf_saved = hf_token.strip()
|
| 163 |
-
status_messages.append("✅ HuggingFace token saved")
|
| 164 |
-
else:
|
| 165 |
-
hf_saved = os.getenv("HF_TOKEN", "")
|
| 166 |
-
status_messages.append("ℹ️ HuggingFace token cleared (using environment variable if set)")
|
| 167 |
-
|
| 168 |
-
status_markdown = "\n\n".join(status_messages)
|
| 169 |
-
|
| 170 |
-
return gemini_saved, hf_saved, f"### Save Status\n\n{status_markdown}"
|
| 171 |
-
|
| 172 |
-
def clear_api_keys():
|
| 173 |
-
"""
|
| 174 |
-
Clear session API keys and revert to environment variables.
|
| 175 |
-
|
| 176 |
-
Returns:
|
| 177 |
-
tuple: Cleared state values and status message
|
| 178 |
-
"""
|
| 179 |
-
env_gemini = os.getenv("GEMINI_API_KEY", "")
|
| 180 |
-
env_hf = os.getenv("HF_TOKEN", "")
|
| 181 |
-
|
| 182 |
-
status = "### Keys Cleared\n\nReverted to environment variables.\n\n"
|
| 183 |
-
if env_gemini:
|
| 184 |
-
status += "✅ Using GEMINI_API_KEY from environment\n\n"
|
| 185 |
-
else:
|
| 186 |
-
status += "⚠️ No GEMINI_API_KEY in environment\n\n"
|
| 187 |
-
|
| 188 |
-
if env_hf:
|
| 189 |
-
status += "✅ Using HF_TOKEN from environment"
|
| 190 |
-
else:
|
| 191 |
-
status += "⚠️ No HF_TOKEN in environment"
|
| 192 |
-
|
| 193 |
-
return env_gemini, env_hf, status
|
| 194 |
-
|
| 195 |
-
save_keys_button.click(
|
| 196 |
-
fn=save_api_keys,
|
| 197 |
-
inputs=[gemini_key_input, hf_token_input],
|
| 198 |
-
outputs=[gemini_key_state, hf_token_state, keys_save_status]
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
clear_keys_button.click(
|
| 202 |
-
fn=clear_api_keys,
|
| 203 |
-
inputs=[],
|
| 204 |
-
outputs=[gemini_key_state, hf_token_state, keys_save_status]
|
| 205 |
-
)
|
| 206 |
-
|
| 207 |
-
gr.Markdown("""
|
| 208 |
-
---
|
| 209 |
-
|
| 210 |
-
### How It Works
|
| 211 |
-
|
| 212 |
-
1. **Enter your API keys** in the fields above
|
| 213 |
-
2. **Click "Save API Keys"** to validate and store them for this session
|
| 214 |
-
3. **Use any tool** - they will automatically use your configured keys
|
| 215 |
-
4. **Keys are session-only** - they won't be saved when you close the browser
|
| 216 |
-
|
| 217 |
-
### Environment Variables (Alternative)
|
| 218 |
-
|
| 219 |
-
You can also set these as environment variables:
|
| 220 |
-
```bash
|
| 221 |
-
export GEMINI_API_KEY="your-key-here"
|
| 222 |
-
export HF_TOKEN="your-token-here"
|
| 223 |
-
```
|
| 224 |
-
|
| 225 |
-
UI-configured keys will always override environment variables.
|
| 226 |
-
""")
|
| 227 |
-
|
| 228 |
# Tab 1: Analyze Leaderboard
|
| 229 |
with gr.Tab("📊 Analyze Leaderboard"):
|
| 230 |
gr.Markdown("### Get AI-powered insights from evaluation leaderboard")
|
|
@@ -258,7 +114,7 @@ def create_gradio_ui():
|
|
| 258 |
with gr.Column():
|
| 259 |
lb_output = gr.Markdown(label="Analysis Results")
|
| 260 |
|
| 261 |
-
async def run_analyze_leaderboard(repo, metric, time_range, top_n
|
| 262 |
"""
|
| 263 |
Analyze agent evaluation leaderboard and generate AI-powered insights.
|
| 264 |
|
|
@@ -278,16 +134,11 @@ def create_gradio_ui():
|
|
| 278 |
str: Markdown-formatted analysis with top performers, trends, and recommendations
|
| 279 |
"""
|
| 280 |
try:
|
| 281 |
-
# Use user-provided key or fall back to environment variable
|
| 282 |
-
api_key = gemini_key if gemini_key and gemini_key.strip() else None
|
| 283 |
-
|
| 284 |
result = await analyze_leaderboard(
|
| 285 |
leaderboard_repo=repo,
|
| 286 |
metric_focus=metric,
|
| 287 |
time_range=time_range,
|
| 288 |
-
top_n=int(top_n)
|
| 289 |
-
hf_token=hf_token if hf_token and hf_token.strip() else None,
|
| 290 |
-
gemini_api_key=api_key
|
| 291 |
)
|
| 292 |
return result
|
| 293 |
except Exception as e:
|
|
@@ -295,7 +146,7 @@ def create_gradio_ui():
|
|
| 295 |
|
| 296 |
lb_button.click(
|
| 297 |
fn=run_analyze_leaderboard,
|
| 298 |
-
inputs=[lb_repo, lb_metric, lb_time, lb_top_n
|
| 299 |
outputs=[lb_output]
|
| 300 |
)
|
| 301 |
|
|
@@ -325,7 +176,7 @@ def create_gradio_ui():
|
|
| 325 |
with gr.Column():
|
| 326 |
trace_output = gr.Markdown(label="Debug Analysis")
|
| 327 |
|
| 328 |
-
async def run_debug_trace(trace_id_val, traces_repo_val, question_val
|
| 329 |
"""
|
| 330 |
Debug a specific agent execution trace using OpenTelemetry data.
|
| 331 |
|
|
@@ -347,23 +198,17 @@ def create_gradio_ui():
|
|
| 347 |
if not trace_id_val or not traces_repo_val:
|
| 348 |
return "❌ **Error**: Please provide both Trace ID and Traces Repository"
|
| 349 |
|
| 350 |
-
# Use user-provided key or fall back to environment variable
|
| 351 |
-
api_key = gemini_key if gemini_key and gemini_key.strip() else None
|
| 352 |
-
|
| 353 |
result = await debug_trace(
|
| 354 |
trace_id=trace_id_val,
|
| 355 |
traces_repo=traces_repo_val,
|
| 356 |
-
question=question_val or "Analyze this trace"
|
| 357 |
-
hf_token=hf_token if hf_token and hf_token.strip() else None,
|
| 358 |
-
gemini_api_key=api_key
|
| 359 |
-
)
|
| 360 |
return result
|
| 361 |
except Exception as e:
|
| 362 |
return f"❌ **Error**: {str(e)}"
|
| 363 |
|
| 364 |
trace_button.click(
|
| 365 |
fn=run_debug_trace,
|
| 366 |
-
inputs=[trace_id, traces_repo, question
|
| 367 |
outputs=[trace_output]
|
| 368 |
)
|
| 369 |
|
|
@@ -401,7 +246,7 @@ def create_gradio_ui():
|
|
| 401 |
with gr.Column():
|
| 402 |
cost_output = gr.Markdown(label="Cost Estimate")
|
| 403 |
|
| 404 |
-
async def run_estimate_cost(model, agent_type, num_tests, hardware
|
| 405 |
"""
|
| 406 |
Estimate the cost, duration, and CO2 emissions of running agent evaluations.
|
| 407 |
|
|
@@ -423,15 +268,11 @@ def create_gradio_ui():
|
|
| 423 |
if not model:
|
| 424 |
return "❌ **Error**: Please provide a model name"
|
| 425 |
|
| 426 |
-
# Use user-provided key or fall back to environment variable
|
| 427 |
-
api_key = gemini_key if gemini_key and gemini_key.strip() else None
|
| 428 |
-
|
| 429 |
result = await estimate_cost(
|
| 430 |
model=model,
|
| 431 |
agent_type=agent_type,
|
| 432 |
num_tests=int(num_tests),
|
| 433 |
-
hardware=hardware
|
| 434 |
-
gemini_api_key=api_key
|
| 435 |
)
|
| 436 |
return result
|
| 437 |
except Exception as e:
|
|
@@ -439,7 +280,7 @@ def create_gradio_ui():
|
|
| 439 |
|
| 440 |
cost_button.click(
|
| 441 |
fn=run_estimate_cost,
|
| 442 |
-
inputs=[cost_model, cost_agent_type, cost_num_tests, cost_hardware
|
| 443 |
outputs=[cost_output]
|
| 444 |
)
|
| 445 |
|
|
@@ -482,7 +323,7 @@ def create_gradio_ui():
|
|
| 482 |
compare_button = gr.Button("🔍 Compare Runs", variant="primary")
|
| 483 |
compare_output = gr.Markdown()
|
| 484 |
|
| 485 |
-
async def run_compare_runs(run_id_1, run_id_2, focus, repo
|
| 486 |
"""
|
| 487 |
Compare two evaluation runs and generate AI-powered comparative analysis.
|
| 488 |
|
|
@@ -502,16 +343,11 @@ def create_gradio_ui():
|
|
| 502 |
str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations
|
| 503 |
"""
|
| 504 |
try:
|
| 505 |
-
# Use user-provided key or fall back to environment variable
|
| 506 |
-
api_key = gemini_key if gemini_key and gemini_key.strip() else None
|
| 507 |
-
|
| 508 |
result = await compare_runs(
|
| 509 |
run_id_1=run_id_1,
|
| 510 |
run_id_2=run_id_2,
|
| 511 |
leaderboard_repo=repo,
|
| 512 |
-
comparison_focus=focus
|
| 513 |
-
hf_token=hf_token if hf_token and hf_token.strip() else None,
|
| 514 |
-
gemini_api_key=api_key
|
| 515 |
)
|
| 516 |
return result
|
| 517 |
except Exception as e:
|
|
@@ -519,7 +355,7 @@ def create_gradio_ui():
|
|
| 519 |
|
| 520 |
compare_button.click(
|
| 521 |
fn=run_compare_runs,
|
| 522 |
-
inputs=[compare_run_id_1, compare_run_id_2, compare_focus, compare_repo
|
| 523 |
outputs=[compare_output]
|
| 524 |
)
|
| 525 |
|
|
@@ -558,7 +394,7 @@ def create_gradio_ui():
|
|
| 558 |
results_button = gr.Button("🔍 Analyze Results", variant="primary")
|
| 559 |
results_output = gr.Markdown()
|
| 560 |
|
| 561 |
-
async def run_analyze_results(repo, focus, max_rows
|
| 562 |
"""
|
| 563 |
Analyze detailed test results and provide optimization recommendations.
|
| 564 |
|
|
@@ -576,15 +412,10 @@ def create_gradio_ui():
|
|
| 576 |
if not repo:
|
| 577 |
return "❌ **Error**: Please provide a results repository"
|
| 578 |
|
| 579 |
-
# Use user-provided key or fall back to environment variable
|
| 580 |
-
api_key = gemini_key if gemini_key and gemini_key.strip() else None
|
| 581 |
-
|
| 582 |
result = await analyze_results(
|
| 583 |
results_repo=repo,
|
| 584 |
analysis_focus=focus,
|
| 585 |
-
max_rows=int(max_rows)
|
| 586 |
-
hf_token=hf_token if hf_token and hf_token.strip() else None,
|
| 587 |
-
gemini_api_key=api_key
|
| 588 |
)
|
| 589 |
return result
|
| 590 |
except Exception as e:
|
|
@@ -592,7 +423,7 @@ def create_gradio_ui():
|
|
| 592 |
|
| 593 |
results_button.click(
|
| 594 |
fn=run_analyze_results,
|
| 595 |
-
inputs=[results_repo_input, results_focus, results_max_rows
|
| 596 |
outputs=[results_output]
|
| 597 |
)
|
| 598 |
|
|
@@ -629,7 +460,7 @@ def create_gradio_ui():
|
|
| 629 |
dataset_button = gr.Button("📥 Load Dataset", variant="primary")
|
| 630 |
dataset_output = gr.JSON(label="Dataset JSON Output")
|
| 631 |
|
| 632 |
-
async def run_get_dataset(repo, max_rows
|
| 633 |
"""
|
| 634 |
Load SMOLTRACE datasets from HuggingFace and return as JSON.
|
| 635 |
|
|
@@ -649,8 +480,7 @@ def create_gradio_ui():
|
|
| 649 |
import json
|
| 650 |
result = await get_dataset(
|
| 651 |
dataset_repo=repo,
|
| 652 |
-
max_rows=int(max_rows)
|
| 653 |
-
hf_token=hf_token if hf_token and hf_token.strip() else None
|
| 654 |
)
|
| 655 |
# Parse JSON string back to dict for JSON component
|
| 656 |
return json.loads(result)
|
|
@@ -659,7 +489,7 @@ def create_gradio_ui():
|
|
| 659 |
|
| 660 |
dataset_button.click(
|
| 661 |
fn=run_get_dataset,
|
| 662 |
-
inputs=[dataset_repo_input, dataset_max_rows
|
| 663 |
outputs=[dataset_output]
|
| 664 |
)
|
| 665 |
|
|
|
|
| 80 |
|
| 81 |
**MCP Endpoint (SSE - Deprecated)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse`
|
| 82 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# Tab 1: Analyze Leaderboard
|
| 85 |
with gr.Tab("📊 Analyze Leaderboard"):
|
| 86 |
gr.Markdown("### Get AI-powered insights from evaluation leaderboard")
|
|
|
|
| 114 |
with gr.Column():
|
| 115 |
lb_output = gr.Markdown(label="Analysis Results")
|
| 116 |
|
| 117 |
+
async def run_analyze_leaderboard(repo, metric, time_range, top_n):
|
| 118 |
"""
|
| 119 |
Analyze agent evaluation leaderboard and generate AI-powered insights.
|
| 120 |
|
|
|
|
| 134 |
str: Markdown-formatted analysis with top performers, trends, and recommendations
|
| 135 |
"""
|
| 136 |
try:
|
|
|
|
|
|
|
|
|
|
| 137 |
result = await analyze_leaderboard(
|
| 138 |
leaderboard_repo=repo,
|
| 139 |
metric_focus=metric,
|
| 140 |
time_range=time_range,
|
| 141 |
+
top_n=int(top_n)
|
|
|
|
|
|
|
| 142 |
)
|
| 143 |
return result
|
| 144 |
except Exception as e:
|
|
|
|
| 146 |
|
| 147 |
lb_button.click(
|
| 148 |
fn=run_analyze_leaderboard,
|
| 149 |
+
inputs=[lb_repo, lb_metric, lb_time, lb_top_n],
|
| 150 |
outputs=[lb_output]
|
| 151 |
)
|
| 152 |
|
|
|
|
| 176 |
with gr.Column():
|
| 177 |
trace_output = gr.Markdown(label="Debug Analysis")
|
| 178 |
|
| 179 |
+
async def run_debug_trace(trace_id_val, traces_repo_val, question_val):
|
| 180 |
"""
|
| 181 |
Debug a specific agent execution trace using OpenTelemetry data.
|
| 182 |
|
|
|
|
| 198 |
if not trace_id_val or not traces_repo_val:
|
| 199 |
return "❌ **Error**: Please provide both Trace ID and Traces Repository"
|
| 200 |
|
|
|
|
|
|
|
|
|
|
| 201 |
result = await debug_trace(
|
| 202 |
trace_id=trace_id_val,
|
| 203 |
traces_repo=traces_repo_val,
|
| 204 |
+
question=question_val or "Analyze this trace")
|
|
|
|
|
|
|
|
|
|
| 205 |
return result
|
| 206 |
except Exception as e:
|
| 207 |
return f"❌ **Error**: {str(e)}"
|
| 208 |
|
| 209 |
trace_button.click(
|
| 210 |
fn=run_debug_trace,
|
| 211 |
+
inputs=[trace_id, traces_repo, question],
|
| 212 |
outputs=[trace_output]
|
| 213 |
)
|
| 214 |
|
|
|
|
| 246 |
with gr.Column():
|
| 247 |
cost_output = gr.Markdown(label="Cost Estimate")
|
| 248 |
|
| 249 |
+
async def run_estimate_cost(model, agent_type, num_tests, hardware):
|
| 250 |
"""
|
| 251 |
Estimate the cost, duration, and CO2 emissions of running agent evaluations.
|
| 252 |
|
|
|
|
| 268 |
if not model:
|
| 269 |
return "❌ **Error**: Please provide a model name"
|
| 270 |
|
|
|
|
|
|
|
|
|
|
| 271 |
result = await estimate_cost(
|
| 272 |
model=model,
|
| 273 |
agent_type=agent_type,
|
| 274 |
num_tests=int(num_tests),
|
| 275 |
+
hardware=hardware
|
|
|
|
| 276 |
)
|
| 277 |
return result
|
| 278 |
except Exception as e:
|
|
|
|
| 280 |
|
| 281 |
cost_button.click(
|
| 282 |
fn=run_estimate_cost,
|
| 283 |
+
inputs=[cost_model, cost_agent_type, cost_num_tests, cost_hardware],
|
| 284 |
outputs=[cost_output]
|
| 285 |
)
|
| 286 |
|
|
|
|
| 323 |
compare_button = gr.Button("🔍 Compare Runs", variant="primary")
|
| 324 |
compare_output = gr.Markdown()
|
| 325 |
|
| 326 |
+
async def run_compare_runs(run_id_1, run_id_2, focus, repo):
|
| 327 |
"""
|
| 328 |
Compare two evaluation runs and generate AI-powered comparative analysis.
|
| 329 |
|
|
|
|
| 343 |
str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations
|
| 344 |
"""
|
| 345 |
try:
|
|
|
|
|
|
|
|
|
|
| 346 |
result = await compare_runs(
|
| 347 |
run_id_1=run_id_1,
|
| 348 |
run_id_2=run_id_2,
|
| 349 |
leaderboard_repo=repo,
|
| 350 |
+
comparison_focus=focus
|
|
|
|
|
|
|
| 351 |
)
|
| 352 |
return result
|
| 353 |
except Exception as e:
|
|
|
|
| 355 |
|
| 356 |
compare_button.click(
|
| 357 |
fn=run_compare_runs,
|
| 358 |
+
inputs=[compare_run_id_1, compare_run_id_2, compare_focus, compare_repo],
|
| 359 |
outputs=[compare_output]
|
| 360 |
)
|
| 361 |
|
|
|
|
| 394 |
results_button = gr.Button("🔍 Analyze Results", variant="primary")
|
| 395 |
results_output = gr.Markdown()
|
| 396 |
|
| 397 |
+
async def run_analyze_results(repo, focus, max_rows):
|
| 398 |
"""
|
| 399 |
Analyze detailed test results and provide optimization recommendations.
|
| 400 |
|
|
|
|
| 412 |
if not repo:
|
| 413 |
return "❌ **Error**: Please provide a results repository"
|
| 414 |
|
|
|
|
|
|
|
|
|
|
| 415 |
result = await analyze_results(
|
| 416 |
results_repo=repo,
|
| 417 |
analysis_focus=focus,
|
| 418 |
+
max_rows=int(max_rows)
|
|
|
|
|
|
|
| 419 |
)
|
| 420 |
return result
|
| 421 |
except Exception as e:
|
|
|
|
| 423 |
|
| 424 |
results_button.click(
|
| 425 |
fn=run_analyze_results,
|
| 426 |
+
inputs=[results_repo_input, results_focus, results_max_rows],
|
| 427 |
outputs=[results_output]
|
| 428 |
)
|
| 429 |
|
|
|
|
| 460 |
dataset_button = gr.Button("📥 Load Dataset", variant="primary")
|
| 461 |
dataset_output = gr.JSON(label="Dataset JSON Output")
|
| 462 |
|
| 463 |
+
async def run_get_dataset(repo, max_rows):
|
| 464 |
"""
|
| 465 |
Load SMOLTRACE datasets from HuggingFace and return as JSON.
|
| 466 |
|
|
|
|
| 480 |
import json
|
| 481 |
result = await get_dataset(
|
| 482 |
dataset_repo=repo,
|
| 483 |
+
max_rows=int(max_rows)
|
|
|
|
| 484 |
)
|
| 485 |
# Parse JSON string back to dict for JSON component
|
| 486 |
return json.loads(result)
|
|
|
|
| 489 |
|
| 490 |
dataset_button.click(
|
| 491 |
fn=run_get_dataset,
|
| 492 |
+
inputs=[dataset_repo_input, dataset_max_rows],
|
| 493 |
outputs=[dataset_output]
|
| 494 |
)
|
| 495 |
|
mcp_tools.py
CHANGED
|
@@ -27,9 +27,7 @@ async def analyze_leaderboard(
|
|
| 27 |
leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
|
| 28 |
metric_focus: str = "overall",
|
| 29 |
time_range: str = "last_week",
|
| 30 |
-
top_n: int = 5
|
| 31 |
-
hf_token: Optional[str] = None,
|
| 32 |
-
gemini_api_key: Optional[str] = None
|
| 33 |
) -> str:
|
| 34 |
"""
|
| 35 |
Answer questions about the leaderboard with AI-powered analysis and insights.
|
|
@@ -47,26 +45,25 @@ async def analyze_leaderboard(
|
|
| 47 |
agent evaluation results, including top performers, trends, cost/performance
|
| 48 |
trade-offs, and actionable recommendations.
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
Args:
|
| 51 |
leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
|
| 52 |
metric_focus (str): Primary metric to focus analysis on. Options: "overall", "accuracy", "cost", "latency", "co2". Default: "overall"
|
| 53 |
time_range (str): Time range for analysis. Options: "last_week", "last_month", "all_time". Default: "last_week"
|
| 54 |
top_n (int): Number of top models to highlight in analysis. Must be between 3 and 10. Default: 5
|
| 55 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 56 |
-
gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
|
| 57 |
|
| 58 |
Returns:
|
| 59 |
str: Markdown-formatted analysis with top performers, insights, trade-offs, and recommendations
|
| 60 |
"""
|
| 61 |
try:
|
| 62 |
-
# Initialize Gemini client
|
| 63 |
-
gemini_client = GeminiClient(
|
| 64 |
-
# Load leaderboard data from HuggingFace
|
| 65 |
print(f"Loading leaderboard from {leaderboard_repo}...")
|
| 66 |
|
| 67 |
-
|
| 68 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 69 |
-
ds = load_dataset(leaderboard_repo, split="train", token=token)
|
| 70 |
df = pd.DataFrame(ds)
|
| 71 |
|
| 72 |
# Filter by time range
|
|
@@ -135,9 +132,7 @@ async def analyze_leaderboard(
|
|
| 135 |
async def debug_trace(
|
| 136 |
trace_id: str,
|
| 137 |
traces_repo: str,
|
| 138 |
-
question: str = "Analyze this trace and explain what happened"
|
| 139 |
-
hf_token: Optional[str] = None,
|
| 140 |
-
gemini_api_key: Optional[str] = None
|
| 141 |
) -> str:
|
| 142 |
"""
|
| 143 |
Answer questions about agent traces with AI-powered debugging and analysis.
|
|
@@ -159,21 +154,16 @@ async def debug_trace(
|
|
| 159 |
trace_id (str): Unique identifier for the trace to analyze (e.g., "trace_abc123")
|
| 160 |
traces_repo (str): HuggingFace dataset repository containing trace data (e.g., "username/agent-traces-model-timestamp")
|
| 161 |
question (str): Specific question about the trace. Default: "Analyze this trace and explain what happened"
|
| 162 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 163 |
-
gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
|
| 164 |
-
|
| 165 |
Returns:
|
| 166 |
str: Markdown-formatted debug analysis with step-by-step breakdown, timing information, and answer to the question
|
| 167 |
"""
|
| 168 |
try:
|
| 169 |
# Initialize Gemini client with provided key or from environment
|
| 170 |
-
gemini_client = GeminiClient(
|
| 171 |
-
# Load traces dataset
|
| 172 |
print(f"Loading traces from {traces_repo}...")
|
| 173 |
|
| 174 |
-
|
| 175 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 176 |
-
ds = load_dataset(traces_repo, split="train", token=token)
|
| 177 |
df = pd.DataFrame(ds)
|
| 178 |
|
| 179 |
# Find the specific trace
|
|
@@ -243,8 +233,7 @@ async def estimate_cost(
|
|
| 243 |
model: str,
|
| 244 |
agent_type: str,
|
| 245 |
num_tests: int = 100,
|
| 246 |
-
hardware: str = "auto"
|
| 247 |
-
gemini_api_key: Optional[str] = None
|
| 248 |
) -> str:
|
| 249 |
"""
|
| 250 |
Answer questions about evaluation costs with AI-powered estimates and recommendations.
|
|
@@ -267,14 +256,12 @@ async def estimate_cost(
|
|
| 267 |
agent_type (str): Type of agent capabilities to test. Options: "tool", "code", "both"
|
| 268 |
num_tests (int): Number of test cases to run. Must be between 10 and 1000. Default: 100
|
| 269 |
hardware (str): Hardware type for HuggingFace Jobs. Options: "auto", "cpu", "gpu_a10", "gpu_h200". Default: "auto"
|
| 270 |
-
gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
|
| 271 |
-
|
| 272 |
Returns:
|
| 273 |
str: Markdown-formatted cost estimate with breakdown of LLM costs, HF Jobs costs, duration, CO2 emissions, and optimization tips
|
| 274 |
"""
|
| 275 |
try:
|
| 276 |
# Initialize Gemini client with provided key or from environment
|
| 277 |
-
gemini_client = GeminiClient(
|
| 278 |
# Determine if API or local model
|
| 279 |
is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])
|
| 280 |
|
|
@@ -378,9 +365,7 @@ async def compare_runs(
|
|
| 378 |
run_id_1: str,
|
| 379 |
run_id_2: str,
|
| 380 |
leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
|
| 381 |
-
comparison_focus: str = "comprehensive"
|
| 382 |
-
hf_token: Optional[str] = None,
|
| 383 |
-
gemini_api_key: Optional[str] = None
|
| 384 |
) -> str:
|
| 385 |
"""
|
| 386 |
Compare two evaluation runs and generate AI-powered comparative analysis.
|
|
@@ -394,19 +379,13 @@ async def compare_runs(
|
|
| 394 |
run_id_2 (str): Second run ID to compare
|
| 395 |
leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
|
| 396 |
comparison_focus (str): Focus area for comparison. Options: "comprehensive", "cost", "performance", "eco_friendly". Default: "comprehensive"
|
| 397 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 398 |
-
gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
|
| 399 |
-
|
| 400 |
Returns:
|
| 401 |
str: Markdown-formatted comparative analysis with winner for each category, trade-offs, and use case recommendations
|
| 402 |
"""
|
| 403 |
try:
|
| 404 |
# Initialize Gemini client with provided key or from environment
|
| 405 |
-
gemini_client = GeminiClient(
|
| 406 |
-
# Load leaderboard data
|
| 407 |
-
# Use user-provided token or fall back to environment variable
|
| 408 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 409 |
-
dataset = load_dataset(leaderboard_repo, split="train", token=token)
|
| 410 |
df = pd.DataFrame(dataset)
|
| 411 |
|
| 412 |
# Find the two runs
|
|
@@ -580,9 +559,7 @@ Provide eco-conscious recommendations for sustainable AI deployment.
|
|
| 580 |
async def analyze_results(
|
| 581 |
results_repo: str,
|
| 582 |
analysis_focus: str = "comprehensive",
|
| 583 |
-
max_rows: int = 100
|
| 584 |
-
hf_token: Optional[str] = None,
|
| 585 |
-
gemini_api_key: Optional[str] = None
|
| 586 |
) -> str:
|
| 587 |
"""
|
| 588 |
Analyze detailed test results and provide optimization recommendations.
|
|
@@ -601,20 +578,17 @@ async def analyze_results(
|
|
| 601 |
results_repo (str): HuggingFace dataset repository containing results (e.g., "username/smoltrace-results-gpt4-20251114")
|
| 602 |
analysis_focus (str): Focus area. Options: "failures", "performance", "cost", "comprehensive". Default: "comprehensive"
|
| 603 |
max_rows (int): Maximum test cases to analyze. Default: 100. Range: 10-500
|
| 604 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 605 |
-
gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
|
| 606 |
-
|
| 607 |
Returns:
|
| 608 |
str: Markdown-formatted analysis with failure patterns, performance insights, cost analysis, and optimization recommendations
|
| 609 |
"""
|
| 610 |
try:
|
| 611 |
# Initialize Gemini client
|
| 612 |
-
gemini_client = GeminiClient(
|
| 613 |
|
| 614 |
# Load results dataset
|
| 615 |
print(f"Loading results from {results_repo}...")
|
| 616 |
-
|
| 617 |
-
ds = load_dataset(results_repo, split="train"
|
| 618 |
df = pd.DataFrame(ds)
|
| 619 |
|
| 620 |
if df.empty:
|
|
@@ -727,8 +701,7 @@ async def analyze_results(
|
|
| 727 |
@gr.mcp.tool()
|
| 728 |
async def get_dataset(
|
| 729 |
dataset_repo: str,
|
| 730 |
-
max_rows: int = 50
|
| 731 |
-
hf_token: Optional[str] = None
|
| 732 |
) -> str:
|
| 733 |
"""
|
| 734 |
Load SMOLTRACE datasets from HuggingFace and return as JSON.
|
|
@@ -748,8 +721,6 @@ async def get_dataset(
|
|
| 748 |
Args:
|
| 749 |
dataset_repo (str): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard")
|
| 750 |
max_rows (int): Maximum number of rows to return. Default: 50. Range: 1-200
|
| 751 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 752 |
-
|
| 753 |
Returns:
|
| 754 |
str: JSON object with dataset data and metadata
|
| 755 |
"""
|
|
@@ -762,10 +733,7 @@ async def get_dataset(
|
|
| 762 |
"data": []
|
| 763 |
}, indent=2, default=str)
|
| 764 |
|
| 765 |
-
# Load dataset from HuggingFace
|
| 766 |
-
# Use user-provided token or fall back to environment variable
|
| 767 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 768 |
-
dataset = load_dataset(dataset_repo, split="train", token=token)
|
| 769 |
df = pd.DataFrame(dataset)
|
| 770 |
|
| 771 |
if df.empty:
|
|
@@ -815,7 +783,7 @@ async def get_dataset(
|
|
| 815 |
# ============================================================================
|
| 816 |
|
| 817 |
@gr.mcp.resource("leaderboard://{repo}")
|
| 818 |
-
def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard"
|
| 819 |
"""
|
| 820 |
[RAW DATA ONLY] Get raw leaderboard data in JSON format - NO analysis or insights.
|
| 821 |
|
|
@@ -829,17 +797,15 @@ def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard", hf_
|
|
| 829 |
|
| 830 |
For questions, insights, recommendations, or analysis → use analyze_leaderboard tool instead!
|
| 831 |
|
|
|
|
|
|
|
| 832 |
Args:
|
| 833 |
repo (str): HuggingFace dataset repository name. Default: "kshitijthakkar/smoltrace-leaderboard"
|
| 834 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 835 |
-
|
| 836 |
Returns:
|
| 837 |
str: Raw JSON string containing all evaluation runs without any analysis
|
| 838 |
"""
|
| 839 |
-
try:
|
| 840 |
-
|
| 841 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 842 |
-
ds = load_dataset(repo, split="train", token=token)
|
| 843 |
df = pd.DataFrame(ds)
|
| 844 |
|
| 845 |
# Convert to JSON with proper formatting
|
|
@@ -858,7 +824,7 @@ def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard", hf_
|
|
| 858 |
|
| 859 |
|
| 860 |
@gr.mcp.resource("trace://{trace_id}/{repo}")
|
| 861 |
-
def get_trace_data(trace_id: str, repo: str
|
| 862 |
"""
|
| 863 |
[RAW DATA ONLY] Get raw OpenTelemetry trace data in JSON format - NO analysis.
|
| 864 |
|
|
@@ -872,18 +838,16 @@ def get_trace_data(trace_id: str, repo: str, hf_token: Optional[str] = None) ->
|
|
| 872 |
|
| 873 |
For debugging, questions, or analysis → use debug_trace tool instead!
|
| 874 |
|
|
|
|
|
|
|
| 875 |
Args:
|
| 876 |
trace_id (str): Unique identifier for the trace (e.g., "trace_abc123")
|
| 877 |
repo (str): HuggingFace dataset repository containing traces (e.g., "username/agent-traces-model")
|
| 878 |
-
hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
|
| 879 |
-
|
| 880 |
Returns:
|
| 881 |
str: Raw JSON string containing OpenTelemetry spans without any analysis
|
| 882 |
"""
|
| 883 |
-
try:
|
| 884 |
-
|
| 885 |
-
token = hf_token if hf_token else os.getenv("HF_TOKEN")
|
| 886 |
-
ds = load_dataset(repo, split="train", token=token)
|
| 887 |
df = pd.DataFrame(ds)
|
| 888 |
|
| 889 |
# Find specific trace
|
|
|
|
| 27 |
leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
|
| 28 |
metric_focus: str = "overall",
|
| 29 |
time_range: str = "last_week",
|
| 30 |
+
top_n: int = 5
|
|
|
|
|
|
|
| 31 |
) -> str:
|
| 32 |
"""
|
| 33 |
Answer questions about the leaderboard with AI-powered analysis and insights.
|
|
|
|
| 45 |
agent evaluation results, including top performers, trends, cost/performance
|
| 46 |
trade-offs, and actionable recommendations.
|
| 47 |
|
| 48 |
+
**Security**: Requires GEMINI_API_KEY environment variable.
|
| 49 |
+
**Note**: All SMOLTRACE datasets are public - no HF token required.
|
| 50 |
+
|
| 51 |
Args:
|
| 52 |
leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
|
| 53 |
metric_focus (str): Primary metric to focus analysis on. Options: "overall", "accuracy", "cost", "latency", "co2". Default: "overall"
|
| 54 |
time_range (str): Time range for analysis. Options: "last_week", "last_month", "all_time". Default: "last_week"
|
| 55 |
top_n (int): Number of top models to highlight in analysis. Must be between 3 and 10. Default: 5
|
|
|
|
|
|
|
| 56 |
|
| 57 |
Returns:
|
| 58 |
str: Markdown-formatted analysis with top performers, insights, trade-offs, and recommendations
|
| 59 |
"""
|
| 60 |
try:
|
| 61 |
+
# Initialize Gemini client from environment variable only
|
| 62 |
+
gemini_client = GeminiClient()
|
| 63 |
+
# Load leaderboard data from HuggingFace (public dataset)
|
| 64 |
print(f"Loading leaderboard from {leaderboard_repo}...")
|
| 65 |
|
| 66 |
+
ds = load_dataset(leaderboard_repo, split="train")
|
|
|
|
|
|
|
| 67 |
df = pd.DataFrame(ds)
|
| 68 |
|
| 69 |
# Filter by time range
|
|
|
|
| 132 |
async def debug_trace(
|
| 133 |
trace_id: str,
|
| 134 |
traces_repo: str,
|
| 135 |
+
question: str = "Analyze this trace and explain what happened"
|
|
|
|
|
|
|
| 136 |
) -> str:
|
| 137 |
"""
|
| 138 |
Answer questions about agent traces with AI-powered debugging and analysis.
|
|
|
|
| 154 |
trace_id (str): Unique identifier for the trace to analyze (e.g., "trace_abc123")
|
| 155 |
traces_repo (str): HuggingFace dataset repository containing trace data (e.g., "username/agent-traces-model-timestamp")
|
| 156 |
question (str): Specific question about the trace. Default: "Analyze this trace and explain what happened"
|
|
|
|
|
|
|
|
|
|
| 157 |
Returns:
|
| 158 |
str: Markdown-formatted debug analysis with step-by-step breakdown, timing information, and answer to the question
|
| 159 |
"""
|
| 160 |
try:
|
| 161 |
# Initialize Gemini client from the GEMINI_API_KEY environment variable only
|
| 162 |
+
gemini_client = GeminiClient()
|
| 163 |
+
# Load traces dataset (public dataset)
|
| 164 |
print(f"Loading traces from {traces_repo}...")
|
| 165 |
|
| 166 |
+
ds = load_dataset(traces_repo, split="train")
|
|
|
|
|
|
|
| 167 |
df = pd.DataFrame(ds)
|
| 168 |
|
| 169 |
# Find the specific trace
|
|
|
|
| 233 |
model: str,
|
| 234 |
agent_type: str,
|
| 235 |
num_tests: int = 100,
|
| 236 |
+
hardware: str = "auto"
|
|
|
|
| 237 |
) -> str:
|
| 238 |
"""
|
| 239 |
Answer questions about evaluation costs with AI-powered estimates and recommendations.
|
|
|
|
| 256 |
agent_type (str): Type of agent capabilities to test. Options: "tool", "code", "both"
|
| 257 |
num_tests (int): Number of test cases to run. Must be between 10 and 1000. Default: 100
|
| 258 |
hardware (str): Hardware type for HuggingFace Jobs. Options: "auto", "cpu", "gpu_a10", "gpu_h200". Default: "auto"
|
|
|
|
|
|
|
| 259 |
Returns:
|
| 260 |
str: Markdown-formatted cost estimate with breakdown of LLM costs, HF Jobs costs, duration, CO2 emissions, and optimization tips
|
| 261 |
"""
|
| 262 |
try:
|
| 263 |
# Initialize Gemini client from the GEMINI_API_KEY environment variable only
|
| 264 |
+
gemini_client = GeminiClient()
|
| 265 |
# Determine if API or local model
|
| 266 |
is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])
|
| 267 |
|
|
|
|
| 365 |
run_id_1: str,
|
| 366 |
run_id_2: str,
|
| 367 |
leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
|
| 368 |
+
comparison_focus: str = "comprehensive"
|
|
|
|
|
|
|
| 369 |
) -> str:
|
| 370 |
"""
|
| 371 |
Compare two evaluation runs and generate AI-powered comparative analysis.
|
|
|
|
| 379 |
run_id_2 (str): Second run ID to compare
|
| 380 |
leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
|
| 381 |
comparison_focus (str): Focus area for comparison. Options: "comprehensive", "cost", "performance", "eco_friendly". Default: "comprehensive"
|
|
|
|
|
|
|
|
|
|
| 382 |
Returns:
|
| 383 |
str: Markdown-formatted comparative analysis with winner for each category, trade-offs, and use case recommendations
|
| 384 |
"""
|
| 385 |
try:
|
| 386 |
# Initialize Gemini client from the GEMINI_API_KEY environment variable only
|
| 387 |
+
gemini_client = GeminiClient()
|
| 388 |
+
# Load leaderboard data
dataset = load_dataset(leaderboard_repo, split="train")
|
|
|
|
|
|
|
|
|
|
| 389 |
df = pd.DataFrame(dataset)
|
| 390 |
|
| 391 |
# Find the two runs
|
|
|
|
| 559 |
async def analyze_results(
|
| 560 |
results_repo: str,
|
| 561 |
analysis_focus: str = "comprehensive",
|
| 562 |
+
max_rows: int = 100
|
|
|
|
|
|
|
| 563 |
) -> str:
|
| 564 |
"""
|
| 565 |
Analyze detailed test results and provide optimization recommendations.
|
|
|
|
| 578 |
results_repo (str): HuggingFace dataset repository containing results (e.g., "username/smoltrace-results-gpt4-20251114")
|
| 579 |
analysis_focus (str): Focus area. Options: "failures", "performance", "cost", "comprehensive". Default: "comprehensive"
|
| 580 |
max_rows (int): Maximum test cases to analyze. Default: 100. Range: 10-500
|
|
|
|
|
|
|
|
|
|
| 581 |
Returns:
|
| 582 |
str: Markdown-formatted analysis with failure patterns, performance insights, cost analysis, and optimization recommendations
|
| 583 |
"""
|
| 584 |
try:
|
| 585 |
# Initialize Gemini client
|
| 586 |
+
gemini_client = GeminiClient()
|
| 587 |
|
| 588 |
# Load results dataset
|
| 589 |
print(f"Loading results from {results_repo}...")
|
| 590 |
+
|
| 591 |
+
ds = load_dataset(results_repo, split="train")
|
| 592 |
df = pd.DataFrame(ds)
|
| 593 |
|
| 594 |
if df.empty:
|
|
|
|
| 701 |
@gr.mcp.tool()
|
| 702 |
async def get_dataset(
|
| 703 |
dataset_repo: str,
|
| 704 |
+
max_rows: int = 50
|
|
|
|
| 705 |
) -> str:
|
| 706 |
"""
|
| 707 |
Load SMOLTRACE datasets from HuggingFace and return as JSON.
|
|
|
|
| 721 |
Args:
|
| 722 |
dataset_repo (str): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard")
|
| 723 |
max_rows (int): Maximum number of rows to return. Default: 50. Range: 1-200
|
|
|
|
|
|
|
| 724 |
Returns:
|
| 725 |
str: JSON object with dataset data and metadata
|
| 726 |
"""
|
|
|
|
| 733 |
"data": []
|
| 734 |
}, indent=2, default=str)
|
| 735 |
|
| 736 |
+
# Load dataset from HuggingFace
dataset = load_dataset(dataset_repo, split="train")
|
|
|
|
|
|
|
|
|
|
| 737 |
df = pd.DataFrame(dataset)
|
| 738 |
|
| 739 |
if df.empty:
|
|
|
|
| 783 |
# ============================================================================
|
| 784 |
|
| 785 |
@gr.mcp.resource("leaderboard://{repo}")
|
| 786 |
+
def get_leaderboard_data(repo: str = "kshitijthakkar/smoltrace-leaderboard") -> str:
|
| 787 |
"""
|
| 788 |
[RAW DATA ONLY] Get raw leaderboard data in JSON format - NO analysis or insights.
|
| 789 |
|
|
|
|
| 797 |
|
| 798 |
For questions, insights, recommendations, or analysis → use analyze_leaderboard tool instead!
|
| 799 |
|
| 800 |
+
**Note**: All SMOLTRACE datasets are public - no authentication required.
|
| 801 |
+
|
| 802 |
Args:
|
| 803 |
repo (str): HuggingFace dataset repository name. Default: "kshitijthakkar/smoltrace-leaderboard"
|
|
|
|
|
|
|
| 804 |
Returns:
|
| 805 |
str: Raw JSON string containing all evaluation runs without any analysis
|
| 806 |
"""
|
| 807 |
+
try:
|
| 808 |
+
ds = load_dataset(repo, split="train")
|
|
|
|
|
|
|
| 809 |
df = pd.DataFrame(ds)
|
| 810 |
|
| 811 |
# Convert to JSON with proper formatting
|
|
|
|
| 824 |
|
| 825 |
|
| 826 |
@gr.mcp.resource("trace://{trace_id}/{repo}")
|
| 827 |
+
def get_trace_data(trace_id: str, repo: str) -> str:
|
| 828 |
"""
|
| 829 |
[RAW DATA ONLY] Get raw OpenTelemetry trace data in JSON format - NO analysis.
|
| 830 |
|
|
|
|
| 838 |
|
| 839 |
For debugging, questions, or analysis → use debug_trace tool instead!
|
| 840 |
|
| 841 |
+
**Note**: All SMOLTRACE datasets are public - no authentication required.
|
| 842 |
+
|
| 843 |
Args:
|
| 844 |
trace_id (str): Unique identifier for the trace (e.g., "trace_abc123")
|
| 845 |
repo (str): HuggingFace dataset repository containing traces (e.g., "username/agent-traces-model")
|
|
|
|
|
|
|
| 846 |
Returns:
|
| 847 |
str: Raw JSON string containing OpenTelemetry spans without any analysis
|
| 848 |
"""
|
| 849 |
+
try:
|
| 850 |
+
ds = load_dataset(repo, split="train")
|
|
|
|
|
|
|
| 851 |
df = pd.DataFrame(ds)
|
| 852 |
|
| 853 |
# Find specific trace
|