Mandark-droid committed
Commit 84a5f9c · 1 Parent(s): efe1cbf
Fix: Remove gr.State parameter from MCP tools to resolve ASGI errors
- Changed gemini_client parameter from GeminiClient object to optional gemini_api_key string
- MCP tools now initialize GeminiClient internally using provided key or environment variable
- This fixes the ASGI protocol error when Claude Desktop calls the MCP tools
- All 4 affected tools updated: analyze_leaderboard, debug_trace, estimate_cost, compare_runs
- Updated app.py wrapper functions to pass gemini_api_key instead of gemini_client object
- Added .gradio/ to .gitignore
- .gitignore +1 -0
- app.py +16 -36
- mcp_tools.py +20 -12
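
The heart of the fix is a signature change. Gradio exposes these functions as MCP tools by generating a JSON schema from their parameters; a live `GeminiClient` object (previously threaded in via `gr.State`) is not JSON-serializable, which, per the commit message, surfaced as an ASGI protocol error when Claude Desktop invoked the tools. Below is a minimal sketch of the before/after pattern, assuming (as the diff's comments state) that `GeminiClient()` without arguments falls back to the `GEMINI_API_KEY` environment variable; the `GeminiClient` class here is a stand-in, since gemini_client.py is not part of this commit:

```python
import os
from typing import Optional


class GeminiClient:
    """Stand-in for gemini_client.GeminiClient (not shown in this commit).

    Assumed behavior, per the diff comments: with no explicit key it falls
    back to the GEMINI_API_KEY environment variable.
    """

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")


# Before: the tool received a live client object, which the MCP layer
# cannot describe in a JSON schema or accept over the wire.
async def analyze_leaderboard_before(gemini_client: GeminiClient, top_n: int = 5) -> str:
    return f"analysis with top_n={top_n}"


# After: every parameter is a plain, JSON-serializable value; the client
# is constructed inside the tool from the key or the environment.
async def analyze_leaderboard_after(top_n: int = 5, gemini_api_key: Optional[str] = None) -> str:
    gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
    return f"analysis with top_n={top_n}"
```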
.gitignore
CHANGED
```diff
@@ -29,6 +29,7 @@ Thumbs.db
 # Gradio
 flagged/
 gradio_cached_examples/
+.gradio/
 
 # Logs
 *.log
```
app.py
CHANGED
```diff
@@ -267,21 +267,16 @@ def create_gradio_ui():
             str: Markdown-formatted analysis with top performers, trends, and recommendations
         """
         try:
-            #
-            if gemini_key and gemini_key.strip():
-                client = GeminiClient(api_key=gemini_key)
-            elif default_gemini_client:
-                client = default_gemini_client
-            else:
-                return "❌ **Error**: No Gemini API key configured. Please set it in the Settings tab."
+            # Use user-provided key or fall back to environment variable
+            api_key = gemini_key if gemini_key and gemini_key.strip() else None
 
             result = await analyze_leaderboard(
-                gemini_client=client,
                 leaderboard_repo=repo,
                 metric_focus=metric,
                 time_range=time_range,
                 top_n=int(top_n),
-                hf_token=hf_token if hf_token and hf_token.strip() else None
+                hf_token=hf_token if hf_token and hf_token.strip() else None,
+                gemini_api_key=api_key
             )
             return result
         except Exception as e:
@@ -341,20 +336,15 @@ def create_gradio_ui():
             if not trace_id_val or not traces_repo_val:
                 return "❌ **Error**: Please provide both Trace ID and Traces Repository"
 
-            #
-            if gemini_key and gemini_key.strip():
-                client = GeminiClient(api_key=gemini_key)
-            elif default_gemini_client:
-                client = default_gemini_client
-            else:
-                return "❌ **Error**: No Gemini API key configured. Please set it in the Settings tab."
+            # Use user-provided key or fall back to environment variable
+            api_key = gemini_key if gemini_key and gemini_key.strip() else None
 
             result = await debug_trace(
-                gemini_client=client,
                 trace_id=trace_id_val,
                 traces_repo=traces_repo_val,
                 question=question_val or "Analyze this trace",
-                hf_token=hf_token if hf_token and hf_token.strip() else None
+                hf_token=hf_token if hf_token and hf_token.strip() else None,
+                gemini_api_key=api_key
             )
             return result
         except Exception as e:
@@ -422,20 +412,15 @@ def create_gradio_ui():
             if not model:
                 return "❌ **Error**: Please provide a model name"
 
-            #
-            if gemini_key and gemini_key.strip():
-                client = GeminiClient(api_key=gemini_key)
-            elif default_gemini_client:
-                client = default_gemini_client
-            else:
-                return "❌ **Error**: No Gemini API key configured. Please set it in the Settings tab."
+            # Use user-provided key or fall back to environment variable
+            api_key = gemini_key if gemini_key and gemini_key.strip() else None
 
             result = await estimate_cost(
-                gemini_client=client,
                 model=model,
                 agent_type=agent_type,
                 num_tests=int(num_tests),
-                hardware=hardware
+                hardware=hardware,
+                gemini_api_key=api_key
             )
             return result
         except Exception as e:
@@ -506,21 +491,16 @@ def create_gradio_ui():
             str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations
         """
         try:
-            #
-            if gemini_key and gemini_key.strip():
-                client = GeminiClient(api_key=gemini_key)
-            elif default_gemini_client:
-                client = default_gemini_client
-            else:
-                return "❌ **Error**: No Gemini API key configured. Please set it in the Settings tab."
+            # Use user-provided key or fall back to environment variable
+            api_key = gemini_key if gemini_key and gemini_key.strip() else None
 
             result = await compare_runs(
-                gemini_client=client,
                 run_id_1=run_id_1,
                 run_id_2=run_id_2,
                 leaderboard_repo=repo,
                 comparison_focus=focus,
-                hf_token=hf_token if hf_token and hf_token.strip() else None
+                hf_token=hf_token if hf_token and hf_token.strip() else None,
+                gemini_api_key=api_key
             )
             return result
         except Exception as e:
```
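With the wrapper branching gone, a tool can be exercised directly with keyword arguments that are all plain strings and numbers, which is exactly what an MCP client sends. A hypothetical local invocation (argument names and option values come from the diff; the call itself is illustrative):

```python
import asyncio

from mcp_tools import analyze_leaderboard

# Hypothetical direct call; all arguments are JSON-serializable, so an MCP
# client such as Claude Desktop can supply the same values over the wire.
result = asyncio.run(analyze_leaderboard(
    leaderboard_repo="kshitijthakkar/smoltrace-leaderboard",
    metric_focus="cost",      # "overall", "accuracy", "cost", "latency", or "co2"
    time_range="last_month",  # "last_week", "last_month", or "all_time"
    top_n=5,                  # between 3 and 10
    hf_token=None,            # falls back to the HF_TOKEN environment variable
    gemini_api_key=None,      # falls back to GEMINI_API_KEY
))
print(result)
```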
mcp_tools.py
CHANGED
```diff
@@ -23,12 +23,12 @@ from gemini_client import GeminiClient
 
 
 async def analyze_leaderboard(
-    gemini_client: GeminiClient,
     leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
     metric_focus: str = "overall",
     time_range: str = "last_week",
     top_n: int = 5,
-    hf_token: Optional[str] = None
+    hf_token: Optional[str] = None,
+    gemini_api_key: Optional[str] = None
 ) -> str:
     """
     Analyze evaluation leaderboard and generate AI-powered insights.
@@ -38,17 +38,19 @@ async def analyze_leaderboard(
     trends, cost/performance trade-offs, and actionable recommendations.
 
     Args:
-        gemini_client (GeminiClient): Initialized Gemini client for AI analysis
         leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
         metric_focus (str): Primary metric to focus analysis on. Options: "overall", "accuracy", "cost", "latency", "co2". Default: "overall"
         time_range (str): Time range for analysis. Options: "last_week", "last_month", "all_time". Default: "last_week"
         top_n (int): Number of top models to highlight in analysis. Must be between 3 and 10. Default: 5
         hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
+        gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
 
     Returns:
         str: Markdown-formatted analysis with top performers, insights, trade-offs, and recommendations
     """
     try:
+        # Initialize Gemini client with provided key or from environment
+        gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
         # Load leaderboard data from HuggingFace
         print(f"Loading leaderboard from {leaderboard_repo}...")
 
@@ -120,11 +122,11 @@ async def analyze_leaderboard(
 
 
 async def debug_trace(
-    gemini_client: GeminiClient,
     trace_id: str,
     traces_repo: str,
     question: str = "Analyze this trace and explain what happened",
-    hf_token: Optional[str] = None
+    hf_token: Optional[str] = None,
+    gemini_api_key: Optional[str] = None
 ) -> str:
     """
     Debug a specific agent execution trace using OpenTelemetry data.
@@ -134,16 +136,18 @@ async def debug_trace(
     identify bottlenecks, and explain agent behavior.
 
     Args:
-        gemini_client (GeminiClient): Initialized Gemini client for AI analysis
         trace_id (str): Unique identifier for the trace to analyze (e.g., "trace_abc123")
         traces_repo (str): HuggingFace dataset repository containing trace data (e.g., "username/agent-traces-model-timestamp")
         question (str): Specific question about the trace. Default: "Analyze this trace and explain what happened"
         hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
+        gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
 
     Returns:
         str: Markdown-formatted debug analysis with step-by-step breakdown, timing information, and answer to the question
     """
     try:
+        # Initialize Gemini client with provided key or from environment
+        gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
         # Load traces dataset
         print(f"Loading traces from {traces_repo}...")
 
@@ -215,11 +219,11 @@ async def debug_trace(
 
 
 async def estimate_cost(
-    gemini_client: GeminiClient,
     model: str,
     agent_type: str,
     num_tests: int = 100,
-    hardware: str = "auto"
+    hardware: str = "auto",
+    gemini_api_key: Optional[str] = None
 ) -> str:
     """
     Estimate the cost, duration, and CO2 emissions of running agent evaluations.
@@ -229,16 +233,18 @@ async def estimate_cost(
     to provide cost breakdown and optimization recommendations.
 
     Args:
-        gemini_client (GeminiClient): Initialized Gemini client for AI analysis
         model (str): Model identifier in litellm format (e.g., "openai/gpt-4", "meta-llama/Llama-3.1-8B")
         agent_type (str): Type of agent capabilities to test. Options: "tool", "code", "both"
         num_tests (int): Number of test cases to run. Must be between 10 and 1000. Default: 100
         hardware (str): Hardware type for HuggingFace Jobs. Options: "auto", "cpu", "gpu_a10", "gpu_h200". Default: "auto"
+        gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
 
     Returns:
         str: Markdown-formatted cost estimate with breakdown of LLM costs, HF Jobs costs, duration, CO2 emissions, and optimization tips
     """
     try:
+        # Initialize Gemini client with provided key or from environment
+        gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
         # Determine if API or local model
         is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])
 
@@ -338,12 +344,12 @@ async def estimate_cost(
 
 
 async def compare_runs(
-    gemini_client: GeminiClient,
     run_id_1: str,
     run_id_2: str,
     leaderboard_repo: str = "kshitijthakkar/smoltrace-leaderboard",
     comparison_focus: str = "comprehensive",
-    hf_token: Optional[str] = None
+    hf_token: Optional[str] = None,
+    gemini_api_key: Optional[str] = None
 ) -> str:
     """
     Compare two evaluation runs and generate AI-powered comparative analysis.
@@ -353,17 +359,19 @@ async def compare_runs(
     success rate, cost efficiency, speed, environmental impact, and use case recommendations.
 
     Args:
-        gemini_client (GeminiClient): Initialized Gemini client for AI analysis
         run_id_1 (str): First run ID to compare
        run_id_2 (str): Second run ID to compare
         leaderboard_repo (str): HuggingFace dataset repository containing leaderboard data. Default: "kshitijthakkar/smoltrace-leaderboard"
         comparison_focus (str): Focus area for comparison. Options: "comprehensive", "cost", "performance", "eco_friendly". Default: "comprehensive"
         hf_token (Optional[str]): HuggingFace token for dataset access. If None, uses HF_TOKEN environment variable.
+        gemini_api_key (Optional[str]): Google Gemini API key. If None, uses GEMINI_API_KEY environment variable.
 
     Returns:
         str: Markdown-formatted comparative analysis with winner for each category, trade-offs, and use case recommendations
     """
     try:
+        # Initialize Gemini client with provided key or from environment
+        gemini_client = GeminiClient(api_key=gemini_api_key) if gemini_api_key else GeminiClient()
         # Load leaderboard data
         # Use user-provided token or fall back to environment variable
         token = hf_token if hf_token else os.getenv("HF_TOKEN")
```
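
The client construction is now duplicated at the top of all four tools. Since the no-argument branch already relies on `GeminiClient` resolving `GEMINI_API_KEY` itself (gemini_client.py is not shown in this commit), the four copies could collapse into one small helper; a hypothetical sketch under that assumption, with `make_gemini_client` as an invented name:

```python
import os
from typing import Optional

from gemini_client import GeminiClient  # not included in this commit


def make_gemini_client(gemini_api_key: Optional[str] = None) -> GeminiClient:
    """Hypothetical helper: build a client from an explicit key or the
    GEMINI_API_KEY environment variable, mirroring the per-tool
    construction in the diff above."""
    key = gemini_api_key or os.getenv("GEMINI_API_KEY")
    return GeminiClient(api_key=key) if key else GeminiClient()
```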