Mandark-droid committed on
Commit
7f90c34
·
1 Parent(s): 0fca968

Implement HTMLPlus for clickable leaderboard rows in By Model tab

Browse files

- Add gradio_htmlplus to requirements.txt
- Update leaderboard_table.py to add comprehensive data attributes to table rows
- Convert leaderboard_by_model from gr.HTML to HTMLPlus component
- Add on_html_leaderboard_select() event handler for HTMLPlus row selection
- Keep drilldown tab using gr.Dataframe (no changes to drilldown functionality)
- Enable clicking on leaderboard rows to navigate to run detail screen

Files changed (3) hide show
  1. app.py +233 -3
  2. components/leaderboard_table.py +33 -80
  3. requirements.txt +1 -0
app.py CHANGED
@@ -6,6 +6,7 @@ Enterprise-grade AI agent evaluation with MCP integration
6
  import os
7
  import pandas as pd
8
  import gradio as gr
 
9
  from dotenv import load_dotenv
10
 
11
  # Load environment variables
@@ -504,7 +505,7 @@ def apply_sidebar_filters(selected_model, selected_agent_type):
504
  sorted_df = df.sort_values(by='success_rate', ascending=False).reset_index(drop=True)
505
  html = generate_leaderboard_html(sorted_df, 'success_rate', False)
506
 
507
- # For drilldown table
508
  display_df = df[[
509
  'run_id', 'model', 'agent_type', 'provider', 'success_rate',
510
  'total_tests', 'avg_duration_ms', 'total_cost_usd', 'submitted_by'
@@ -1132,6 +1133,214 @@ def on_drilldown_select(evt: gr.SelectData, df):
1132
 
1133
 
1134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1135
  def go_back_to_leaderboard():
1136
  """Navigate back to leaderboard screen"""
1137
  return {
@@ -1292,8 +1501,12 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
1292
  with gr.Row():
1293
  apply_filters_btn = gr.Button("🔍 Apply Filters", variant="primary", size="sm")
1294
 
1295
- # Styled HTML leaderboard
1296
- leaderboard_by_model = gr.HTML(label="Styled Leaderboard")
 
 
 
 
1297
 
1298
  with gr.TabItem("📋 DrillDown"):
1299
  gr.Markdown("*Click any row to view detailed run information*")
@@ -1778,6 +1991,23 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
1778
  outputs=[leaderboard_by_model]
1779
  )
1780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1781
  # DrillDown tab inline filters
1782
  apply_drilldown_filters_btn.click(
1783
  fn=apply_drilldown_filters,
 
6
  import os
7
  import pandas as pd
8
  import gradio as gr
9
+ from gradio_htmlplus import HTMLPlus
10
  from dotenv import load_dotenv
11
 
12
  # Load environment variables
 
505
  sorted_df = df.sort_values(by='success_rate', ascending=False).reset_index(drop=True)
506
  html = generate_leaderboard_html(sorted_df, 'success_rate', False)
507
 
508
+ # For drilldown table (DataFrame)
509
  display_df = df[[
510
  'run_id', 'model', 'agent_type', 'provider', 'success_rate',
511
  'total_tests', 'avg_duration_ms', 'total_cost_usd', 'submitted_by'
 
1133
 
1134
 
1135
 
1136
def _leaderboard_error_view(message):
    """Build the component-update dict that keeps the user on the leaderboard.

    Used for every early-exit/error path of on_html_leaderboard_select so the
    full set of output components is always updated (Gradio raises if a
    dict-return handler omits a declared output). Only the metadata banner
    carries the error text; all other detail components get a no-op update.

    Args:
        message: Short human-readable error text rendered as an <h3>.

    Returns:
        dict mapping each output component to a gr.update(...).
    """
    return {
        leaderboard_screen: gr.update(visible=True),
        run_detail_screen: gr.update(visible=False),
        run_metadata_html: gr.update(value=f"<h3>{message}</h3>"),
        test_cases_table: gr.update(value=pd.DataFrame()),
        performance_charts: gr.update(),
        run_card_html: gr.update(),
        run_gpu_summary_cards_html: gr.update(),
        run_gpu_metrics_plot: gr.update(),
        run_gpu_metrics_json: gr.update(),
    }


def on_html_leaderboard_select(evt: gr.SelectData):
    """Handle row selection from HTMLPlus leaderboard (By Model tab).

    Resolves the clicked row's data-run-id against the cached leaderboard
    dataframe, loads the run's results/metrics, and returns updates that swap
    the leaderboard screen for the run-detail screen. On any failure the user
    stays on the leaderboard with a warning (see _leaderboard_error_view).

    Args:
        evt: HTMLPlus selection event. evt.index is the matched CSS selector
            (expected "tr"); evt.value is the dict of data-* attributes from
            the clicked element.

    Returns:
        dict mapping output components to gr.update(...) values.
    """
    global current_selected_run, leaderboard_df_cache

    try:
        # HTMLPlus returns data attributes from the selected row:
        #   evt.index = CSS selector that was matched (e.g., "tr")
        #   evt.value = dictionary of data-* attributes from the HTML element
        if evt.index != "tr":
            gr.Warning("Invalid selection")
            return _leaderboard_error_view("Invalid selection")

        # Get the run_id from the data attributes.
        row_data = evt.value
        run_id = row_data.get('run-id')  # Note: HTML data attributes use hyphens

        if not run_id:
            gr.Warning("No run ID found in selection")
            return _leaderboard_error_view("No run ID found")

        print(f"[DEBUG] HTMLPlus selected row with run_id: {run_id[:8]}...")

        # Find the full run data from the cached leaderboard dataframe using run_id.
        if leaderboard_df_cache is not None and not leaderboard_df_cache.empty:
            matching_rows = leaderboard_df_cache[leaderboard_df_cache['run_id'] == run_id]
            if not matching_rows.empty:
                run_data = matching_rows.iloc[0].to_dict()
            else:
                gr.Warning(f"Run ID {run_id[:8]}... not found in leaderboard data")
                return _leaderboard_error_view("Run not found")
        else:
            gr.Warning("Leaderboard data not available")
            return _leaderboard_error_view("Leaderboard data not available")

        # IMPORTANT: Set global FIRST before any operations that might fail,
        # so downstream screens see the selection even if loading errors out.
        current_selected_run = run_data

        print(f"[DEBUG] Selected run: {run_data.get('model', 'Unknown')} (run_id: {run_data.get('run_id', 'N/A')[:8]}...)")

        # Load results for this run.
        results_dataset = run_data.get('results_dataset')
        if not results_dataset:
            gr.Warning("No results dataset found for this run")
            return _leaderboard_error_view("No results dataset found")

        results_df = data_loader.load_results(results_dataset)

        # Generate performance chart.
        perf_chart = create_performance_charts(results_df)

        # Create metadata HTML banner for the run-detail header.
        metadata_html = f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
            <h2 style="margin: 0 0 10px 0;">📊 Run Detail: {run_data.get('model', 'Unknown')}</h2>
            <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
                <div>
                    <strong>Agent Type:</strong> {run_data.get('agent_type', 'N/A')}<br>
                    <strong>Provider:</strong> {run_data.get('provider', 'N/A')}<br>
                    <strong>Success Rate:</strong> {run_data.get('success_rate', 0):.1f}%
                </div>
                <div>
                    <strong>Total Tests:</strong> {run_data.get('total_tests', 0)}<br>
                    <strong>Successful:</strong> {run_data.get('successful_tests', 0)}<br>
                    <strong>Failed:</strong> {run_data.get('failed_tests', 0)}
                </div>
                <div>
                    <strong>Total Cost:</strong> ${run_data.get('total_cost_usd', 0):.4f}<br>
                    <strong>Avg Duration:</strong> {run_data.get('avg_duration_ms', 0):.0f}ms<br>
                    <strong>Submitted By:</strong> {run_data.get('submitted_by', 'Unknown')}
                </div>
            </div>
        </div>
        """

        # Generate run report card HTML.
        run_card_html_content = generate_run_report_card(run_data)

        # Format results for display; copy so we never mutate the loaded frame.
        display_df = results_df.copy()

        # Select and format columns if they exist.
        display_columns = []
        if 'task_id' in display_df.columns:
            display_columns.append('task_id')
        if 'success' in display_df.columns:
            display_df['success'] = display_df['success'].apply(lambda x: "✅" if x else "❌")
            display_columns.append('success')
        if 'tool_called' in display_df.columns:
            display_columns.append('tool_called')
        if 'execution_time_ms' in display_df.columns:
            display_df['execution_time_ms'] = display_df['execution_time_ms'].apply(lambda x: f"{x:.0f}ms")
            display_columns.append('execution_time_ms')
        if 'total_tokens' in display_df.columns:
            display_columns.append('total_tokens')
        if 'cost_usd' in display_df.columns:
            display_df['cost_usd'] = display_df['cost_usd'].apply(lambda x: f"${x:.4f}")
            display_columns.append('cost_usd')
        if 'trace_id' in display_df.columns:
            display_columns.append('trace_id')

        if display_columns:
            display_df = display_df[display_columns]

        # Load GPU metrics (if available); absent metrics are expected for API models.
        gpu_summary_html = "<div style='padding: 20px; text-align: center;'>⚠️ No GPU metrics available (expected for API models)</div>"
        gpu_plot = None
        gpu_json_data = {}

        try:
            if 'metrics_dataset' in run_data and run_data.get('metrics_dataset'):
                metrics_dataset = run_data['metrics_dataset']
                gpu_metrics_data = data_loader.load_metrics(metrics_dataset)

                if gpu_metrics_data is not None and not gpu_metrics_data.empty:
                    from screens.trace_detail import create_gpu_metrics_dashboard, create_gpu_summary_cards
                    gpu_plot = create_gpu_metrics_dashboard(gpu_metrics_data)
                    gpu_summary_html = create_gpu_summary_cards(gpu_metrics_data)
                    gpu_json_data = gpu_metrics_data.to_dict('records')
        except Exception as e:
            # Best-effort: missing/unloadable GPU metrics must not block the run detail view.
            print(f"[WARNING] Could not load GPU metrics for run: {e}")

        print(f"[DEBUG] Successfully loaded run detail for: {run_data.get('model', 'Unknown')}")

        return {
            # Hide leaderboard, show run detail.
            leaderboard_screen: gr.update(visible=False),
            run_detail_screen: gr.update(visible=True),
            run_metadata_html: gr.update(value=metadata_html),
            test_cases_table: gr.update(value=display_df),
            performance_charts: gr.update(value=perf_chart),
            run_card_html: gr.update(value=run_card_html_content),
            run_gpu_summary_cards_html: gr.update(value=gpu_summary_html),
            run_gpu_metrics_plot: gr.update(value=gpu_plot),
            run_gpu_metrics_json: gr.update(value=gpu_json_data)
        }

    except Exception as e:
        print(f"[ERROR] Loading run details from HTMLPlus: {e}")
        import traceback
        traceback.print_exc()
        gr.Warning(f"Error loading run details: {e}")

        # Return updates for all output components to avoid Gradio error;
        # stay on the leaderboard.
        return _leaderboard_error_view("Error loading run detail")
1344
  def go_back_to_leaderboard():
1345
  """Navigate back to leaderboard screen"""
1346
  return {
 
1501
  with gr.Row():
1502
  apply_filters_btn = gr.Button("🔍 Apply Filters", variant="primary", size="sm")
1503
 
1504
+ # Styled HTML leaderboard with clickable rows
1505
+ leaderboard_by_model = HTMLPlus(
1506
+ label="Styled Leaderboard",
1507
+ value="<p>Loading leaderboard...</p>",
1508
+ selectable_elements=["tr"] # Make table rows clickable
1509
+ )
1510
 
1511
  with gr.TabItem("📋 DrillDown"):
1512
  gr.Markdown("*Click any row to view detailed run information*")
 
1991
  outputs=[leaderboard_by_model]
1992
  )
1993
 
1994
+ # HTML Plus leaderboard row selection
1995
+ leaderboard_by_model.select(
1996
+ fn=on_html_leaderboard_select,
1997
+ inputs=None, # HTMLPlus passes data via evt.value
1998
+ outputs=[
1999
+ leaderboard_screen,
2000
+ run_detail_screen,
2001
+ run_metadata_html,
2002
+ test_cases_table,
2003
+ performance_charts,
2004
+ run_card_html,
2005
+ run_gpu_summary_cards_html,
2006
+ run_gpu_metrics_plot,
2007
+ run_gpu_metrics_json
2008
+ ]
2009
+ )
2010
+
2011
  # DrillDown tab inline filters
2012
  apply_drilldown_filters_btn.click(
2013
  fn=apply_drilldown_filters,
components/leaderboard_table.py CHANGED
@@ -346,8 +346,40 @@ def generate_leaderboard_html(
346
  run_id = row.get('run_id', 'N/A')
347
  run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id
348
 
 
 
 
 
 
349
  html += f"""
350
- <tr data-run-id="{run_id}" data-rank="{rank}" class="tm-clickable-row">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  <td>{get_rank_badge(rank)}</td>
352
  <td class="tm-run-id" title="{run_id}">{run_id_short}</td>
353
  <td class="tm-model-name">{model}</td>
@@ -382,85 +414,6 @@ def generate_leaderboard_html(
382
  </tbody>
383
  </table>
384
  </div>
385
-
386
- <script>
387
- // Add click handler for Run ID cells - runs on each table render
388
- (function() {
389
- // Function to attach handlers
390
- function attachRowClickHandlers() {
391
- const cells = document.querySelectorAll('.tm-run-id');
392
- console.log('Found', cells.length, 'Run ID cells');
393
-
394
- cells.forEach(function(cell) {
395
- // Remove existing listener to avoid duplicates
396
- cell.replaceWith(cell.cloneNode(true));
397
- });
398
-
399
- // Re-select after cloning
400
- document.querySelectorAll('.tm-run-id').forEach(function(cell) {
401
- cell.addEventListener('click', function(e) {
402
- e.stopPropagation();
403
- const row = this.closest('tr');
404
- const rowIndex = Array.from(row.parentNode.children).indexOf(row);
405
-
406
- console.log('Run ID clicked, row index:', rowIndex);
407
-
408
- // Try multiple ways to find the textbox
409
- let textbox = null;
410
-
411
- // Method 1: By elem_id
412
- const container1 = document.getElementById('selected_row_index');
413
- if (container1) {
414
- textbox = container1.querySelector('textarea, input[type="text"]');
415
- console.log('Method 1 (elem_id):', textbox ? 'Found' : 'Not found');
416
- }
417
-
418
- // Method 2: By data-testid
419
- if (!textbox) {
420
- const containers = document.querySelectorAll('[data-testid="textbox"]');
421
- console.log('Method 2: Found', containers.length, 'textbox containers');
422
- for (let container of containers) {
423
- const input = container.querySelector('textarea, input[type="text"]');
424
- if (input && !container.closest('.label-wrap')) {
425
- textbox = input;
426
- console.log('Method 2: Using hidden textbox');
427
- break;
428
- }
429
- }
430
- }
431
-
432
- if (textbox) {
433
- // Set the row index
434
- textbox.value = rowIndex.toString();
435
-
436
- // Trigger multiple events to ensure Gradio picks it up
437
- textbox.dispatchEvent(new Event('input', { bubbles: true }));
438
- textbox.dispatchEvent(new Event('change', { bubbles: true }));
439
- textbox.dispatchEvent(new Event('blur', { bubbles: true }));
440
-
441
- // Also try triggering on the container
442
- const container = textbox.closest('[data-testid="textbox"]');
443
- if (container) {
444
- container.dispatchEvent(new Event('input', { bubbles: true }));
445
- }
446
-
447
- console.log('Textbox updated to:', rowIndex);
448
- } else {
449
- console.error('Could not find hidden textbox!');
450
- }
451
- });
452
- });
453
- }
454
-
455
- // Attach immediately
456
- attachRowClickHandlers();
457
-
458
- // Also attach after a short delay (in case table loads async)
459
- setTimeout(attachRowClickHandlers, 500);
460
- setTimeout(attachRowClickHandlers, 1000);
461
- setTimeout(attachRowClickHandlers, 2000);
462
- })();
463
- </script>
464
  """
465
 
466
  return html
 
346
  run_id = row.get('run_id', 'N/A')
347
  run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id
348
 
349
+ # Get dataset references
350
+ results_dataset = row.get('results_dataset', '')
351
+ traces_dataset = row.get('traces_dataset', '')
352
+ metrics_dataset = row.get('metrics_dataset', '')
353
+
354
  html += f"""
355
+ <tr
356
+ data-run-id="{run_id}"
357
+ data-rank="{rank}"
358
+ data-model="{model}"
359
+ data-agent-type="{agent_type}"
360
+ data-provider="{provider}"
361
+ data-success-rate="{success_rate}"
362
+ data-total-tests="{total_tests}"
363
+ data-successful-tests="{successful_tests}"
364
+ data-failed-tests="{failed_tests}"
365
+ data-avg-steps="{avg_steps}"
366
+ data-avg-duration-ms="{avg_duration_ms}"
367
+ data-total-tokens="{total_tokens}"
368
+ data-total-cost-usd="{total_cost_usd}"
369
+ data-co2-emissions-g="{co2_emissions_g}"
370
+ data-gpu-utilization-avg="{gpu_utilization_avg if pd.notna(gpu_utilization_avg) else 'None'}"
371
+ data-gpu-memory-avg-mib="{gpu_memory_avg_mib if pd.notna(gpu_memory_avg_mib) else 'None'}"
372
+ data-gpu-memory-max-mib="{gpu_memory_max_mib if pd.notna(gpu_memory_max_mib) else 'None'}"
373
+ data-gpu-temperature-avg="{gpu_temperature_avg if pd.notna(gpu_temperature_avg) else 'None'}"
374
+ data-gpu-temperature-max="{gpu_temperature_max if pd.notna(gpu_temperature_max) else 'None'}"
375
+ data-gpu-power-avg-w="{gpu_power_avg_w if pd.notna(gpu_power_avg_w) else 'None'}"
376
+ data-timestamp="{timestamp}"
377
+ data-submitted-by="{submitted_by}"
378
+ data-results-dataset="{results_dataset}"
379
+ data-traces-dataset="{traces_dataset}"
380
+ data-metrics-dataset="{metrics_dataset}"
381
+ class="tm-clickable-row"
382
+ >
383
  <td>{get_rank_badge(rank)}</td>
384
  <td class="tm-run-id" title="{run_id}">{run_id_short}</td>
385
  <td class="tm-model-name">{model}</td>
 
414
  </tbody>
415
  </table>
416
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  """
418
 
419
  return html
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  # Core UI
2
  gradio>=5.0.0
3
  gradio_client>=1.0.0 # For calling MCP server from chat screen
 
4
 
5
  # HuggingFace for dataset loading
6
  datasets>=2.14.0
 
1
  # Core UI
2
  gradio>=5.0.0
3
  gradio_client>=1.0.0 # For calling MCP server from chat screen
4
+ gradio_htmlplus # For clickable HTML table rows
5
 
6
  # HuggingFace for dataset loading
7
  datasets>=2.14.0