Mandark-droid committed
Commit 5c51b47 · 1 Parent(s): 3138502

Add compare screen for side-by-side run comparison


- Implement compare screen module with two-run comparison
- Add run comparison cards with gradient styling
- Create comparison charts showing metrics side-by-side
- Generate winner summary with category breakdown
- Include radar chart for multi-dimensional comparison
- Integrate compare navigation with sidebar button
- Wire up dropdown population with available runs
- Add back-to-leaderboard navigation from compare screen
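
A note on the dropdown wiring listed above: the compare screen's run pickers are populated with (label, value) pairs, so the UI shows a readable "model - timestamp" string while event handlers receive the underlying run_id. A minimal sketch of that convention (the run labels and IDs below are hypothetical, not data from this repo):

```python
import gradio as gr

# Hypothetical (label, value) choices; the app builds these from its leaderboard DataFrame.
run_choices = [
    ("model-x - 2025-01-01T10:00:00", "run_001"),
    ("model-y - 2025-01-01T11:30:00", "run_002"),
]

with gr.Blocks() as demo:
    # The dropdown displays the label; click handlers receive the value (the run_id).
    run_a = gr.Dropdown(label="Run A", choices=run_choices, interactive=True)
    selected = gr.Markdown()
    gr.Button("Show selection").click(
        fn=lambda run_id: f"Selected run_id: {run_id}",
        inputs=run_a,
        outputs=selected,
    )
```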

Files changed (2)
  1. app.py +87 -3
  2. screens/compare.py +358 -0
app.py CHANGED
@@ -31,6 +31,10 @@ from screens.dashboard import (
     create_dashboard_ui,
     update_dashboard_data
 )
+from screens.compare import (
+    create_compare_ui,
+    on_compare_runs
+)
 from utils.navigation import Navigator, Screen


@@ -1114,6 +1118,9 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
         trace_ask_btn = gr.Button("Ask", variant="primary")
         trace_answer = gr.Markdown("*Ask a question to get AI-powered insights*")

+    # Screen 5: Compare Screen
+    compare_screen, compare_components = create_compare_ui()
+
     # Navigation handlers (define before use)
     def navigate_to_dashboard():
         """Navigate to dashboard screen and load dashboard data"""
@@ -1130,6 +1137,7 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
             leaderboard_screen: gr.update(visible=False),
             run_detail_screen: gr.update(visible=False),
             trace_detail_screen: gr.update(visible=False),
+            compare_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="primary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             compare_nav_btn: gr.update(variant="secondary"),
@@ -1145,18 +1153,59 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
             leaderboard_screen: gr.update(visible=True),
             run_detail_screen: gr.update(visible=False),
             trace_detail_screen: gr.update(visible=False),
+            compare_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="primary"),
             compare_nav_btn: gr.update(variant="secondary"),
             docs_nav_btn: gr.update(variant="secondary"),
         }

+    def navigate_to_compare():
+        """Navigate to compare screen and populate dropdown choices"""
+        try:
+            leaderboard_df = data_loader.load_leaderboard()
+
+            # Create run choices for dropdowns (model name with run_id)
+            run_choices = []
+            for _, row in leaderboard_df.iterrows():
+                label = f"{row.get('model', 'Unknown')} - {row.get('timestamp', 'N/A')}"
+                value = row.get('run_id', '')
+                if value:
+                    run_choices.append((label, value))
+
+            return {
+                dashboard_screen: gr.update(visible=False),
+                leaderboard_screen: gr.update(visible=False),
+                run_detail_screen: gr.update(visible=False),
+                trace_detail_screen: gr.update(visible=False),
+                compare_screen: gr.update(visible=True),
+                dashboard_nav_btn: gr.update(variant="secondary"),
+                leaderboard_nav_btn: gr.update(variant="secondary"),
+                compare_nav_btn: gr.update(variant="primary"),
+                docs_nav_btn: gr.update(variant="secondary"),
+                compare_components['compare_run_a_dropdown']: gr.update(choices=run_choices),
+                compare_components['compare_run_b_dropdown']: gr.update(choices=run_choices),
+            }
+        except Exception as e:
+            print(f"[ERROR] Navigating to compare: {e}")
+            return {
+                dashboard_screen: gr.update(visible=False),
+                leaderboard_screen: gr.update(visible=False),
+                run_detail_screen: gr.update(visible=False),
+                trace_detail_screen: gr.update(visible=False),
+                compare_screen: gr.update(visible=True),
+                dashboard_nav_btn: gr.update(variant="secondary"),
+                leaderboard_nav_btn: gr.update(variant="secondary"),
+                compare_nav_btn: gr.update(variant="primary"),
+                docs_nav_btn: gr.update(variant="secondary"),
+            }
+
     # Event handlers
     # Load dashboard on app start
     app.load(
         fn=navigate_to_dashboard,
         outputs=[
-            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen,
             dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
         ] + list(dashboard_components.values())
     )
@@ -1249,7 +1298,7 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
     dashboard_nav_btn.click(
         fn=navigate_to_dashboard,
         outputs=[
-            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen,
             dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
         ] + list(dashboard_components.values())
     )
@@ -1257,7 +1306,42 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
     leaderboard_nav_btn.click(
         fn=navigate_to_leaderboard,
         outputs=[
-            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen,
+            dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
+        ]
+    )
+
+    compare_nav_btn.click(
+        fn=navigate_to_compare,
+        outputs=[
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen,
+            dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn,
+            compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
+        ]
+    )
+
+    # Compare button handler
+    compare_components['compare_button'].click(
+        fn=lambda run_a, run_b: on_compare_runs(run_a, run_b, leaderboard_df_cache, compare_components),
+        inputs=[
+            compare_components['compare_run_a_dropdown'],
+            compare_components['compare_run_b_dropdown']
+        ],
+        outputs=[
+            compare_components['comparison_output'],
+            compare_components['run_a_card'],
+            compare_components['run_b_card'],
+            compare_components['comparison_charts'],
+            compare_components['winner_summary'],
+            compare_components['radar_comparison_chart']
+        ]
+    )
+
+    # Back to leaderboard from compare
+    compare_components['back_to_leaderboard_btn'].click(
+        fn=navigate_to_leaderboard,
+        outputs=[
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen,
             dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
         ]
     )
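
One detail worth noting in the app.py changes above: the navigation handlers return a dict keyed by component rather than a positional tuple. In Gradio, a handler that returns a dict only updates the components it names, which is what lets navigate_to_compare also refresh the two run dropdowns while the other navigation handlers leave them untouched. A stripped-down sketch of the pattern (the component names here are illustrative, not the app's real ones):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column(visible=True) as screen_a:
        gr.Markdown("Screen A")
    with gr.Column(visible=False) as screen_b:
        gr.Markdown("Screen B")
    go_b = gr.Button("Go to B")

    def show_b():
        # Dict-style return: only the components named as keys are updated.
        return {
            screen_a: gr.update(visible=False),
            screen_b: gr.update(visible=True),
        }

    # The outputs list still has to declare every component the dict may touch.
    go_b.click(fn=show_b, outputs=[screen_a, screen_b])
```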
screens/compare.py ADDED
@@ -0,0 +1,358 @@
+"""
+Compare Screen for TraceMind-AI
+Side-by-side comparison of two evaluation runs
+"""
+
+import gradio as gr
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+from typing import Dict, Any
+
+
+def create_run_comparison_card(run_data: Dict[str, Any], label: str) -> str:
+    """
+    Create HTML card for a run in comparison view
+
+    Args:
+        run_data: Dict with run information
+        label: "A" or "B"
+
+    Returns:
+        HTML string for the card
+    """
+    model = run_data.get('model', 'Unknown')
+    success_rate = run_data.get('success_rate', 0)
+    total_cost = run_data.get('total_cost_usd', 0)
+    duration = run_data.get('total_duration_ms', 0) / 1000  # Convert to seconds
+    tokens = run_data.get('total_tokens', 0)
+    co2 = run_data.get('co2_emissions_g', 0)
+
+    return f"""
+    <div style="background: linear-gradient(135deg, {'#667eea' if label == 'A' else '#764ba2'} 0%, {'#764ba2' if label == 'A' else '#f093fb'} 100%);
+                padding: 25px;
+                border-radius: 12px;
+                box-shadow: 0 4px 12px rgba(0,0,0,0.2);
+                color: white;">
+        <h3 style="margin-top: 0;">Run {label}: {model}</h3>
+
+        <div style="margin: 20px 0;">
+            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
+                <span>Success Rate:</span>
+                <strong>{success_rate:.1f}%</strong>
+            </div>
+            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
+                <span>Total Cost:</span>
+                <strong>${total_cost:.4f}</strong>
+            </div>
+            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
+                <span>Duration:</span>
+                <strong>{duration:.2f}s</strong>
+            </div>
+            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
+                <span>Tokens:</span>
+                <strong>{tokens:,}</strong>
+            </div>
+            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
+                <span>CO2:</span>
+                <strong>{co2:.2f}g</strong>
+            </div>
+        </div>
+    </div>
+    """
+
+
+def create_comparison_charts(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> go.Figure:
+    """
+    Create comparison charts for two runs
+
+    Args:
+        run_a: First run data dict
+        run_b: Second run data dict
+
+    Returns:
+        Plotly figure with comparison charts
+    """
+    try:
+        # Extract metrics
+        metrics = {
+            'Success Rate (%)': [run_a.get('success_rate', 0), run_b.get('success_rate', 0)],
+            'Cost ($)': [run_a.get('total_cost_usd', 0), run_b.get('total_cost_usd', 0)],
+            'Duration (s)': [run_a.get('total_duration_ms', 0) / 1000, run_b.get('total_duration_ms', 0) / 1000],
+            'Tokens': [run_a.get('total_tokens', 0), run_b.get('total_tokens', 0)],
+            'CO2 (g)': [run_a.get('co2_emissions_g', 0), run_b.get('co2_emissions_g', 0)]
+        }
+
+        # Create subplots
+        fig = make_subplots(
+            rows=2, cols=3,
+            subplot_titles=list(metrics.keys()),
+            specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
+                   [{"type": "bar"}, {"type": "bar"}, {"type": "indicator"}]],
+            vertical_spacing=0.15,
+            horizontal_spacing=0.1
+        )
+
+        model_a = run_a.get('model', 'Run A')
+        model_b = run_b.get('model', 'Run B')
+
+        # Add bar charts for each metric
+        positions = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)]
+        colors_a = ['#667eea', '#667eea', '#667eea', '#667eea', '#667eea']
+        colors_b = ['#764ba2', '#764ba2', '#764ba2', '#764ba2', '#764ba2']
+
+        for idx, (metric_name, values) in enumerate(metrics.items()):
+            if idx < 5:  # First 5 metrics
+                row, col = positions[idx]
+
+                fig.add_trace(
+                    go.Bar(
+                        name=model_a,
+                        x=[model_a],
+                        y=[values[0]],
+                        marker_color=colors_a[idx],
+                        text=[f"{values[0]:.2f}"],
+                        textposition='auto',
+                        showlegend=(idx == 0)
+                    ),
+                    row=row, col=col
+                )
+
+                fig.add_trace(
+                    go.Bar(
+                        name=model_b,
+                        x=[model_b],
+                        y=[values[1]],
+                        marker_color=colors_b[idx],
+                        text=[f"{values[1]:.2f}"],
+                        textposition='auto',
+                        showlegend=(idx == 0)
+                    ),
+                    row=row, col=col
+                )
+
+        fig.update_layout(
+            height=600,
+            showlegend=True,
+            legend=dict(
+                orientation="h",
+                yanchor="bottom",
+                y=1.02,
+                xanchor="right",
+                x=1
+            ),
+            margin=dict(l=50, r=50, t=80, b=50)
+        )
+
+        return fig
+    except Exception as e:
+        print(f"[ERROR] Creating comparison charts: {e}")
+        fig = go.Figure()
+        fig.add_annotation(text=f"Error creating charts: {str(e)}", showarrow=False)
+        return fig
+
+
+def generate_winner_summary(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> str:
+    """
+    Generate winner summary comparing two runs
+
+    Args:
+        run_a: First run data dict
+        run_b: Second run data dict
+
+    Returns:
+        Markdown string with winner analysis
+    """
+    model_a = run_a.get('model', 'Run A')
+    model_b = run_b.get('model', 'Run B')
+
+    # Compare metrics
+    winners = {
+        'accuracy': model_a if run_a.get('success_rate', 0) > run_b.get('success_rate', 0) else model_b,
+        'cost': model_a if run_a.get('total_cost_usd', 999) < run_b.get('total_cost_usd', 999) else model_b,
+        'speed': model_a if run_a.get('total_duration_ms', 999999) < run_b.get('total_duration_ms', 999999) else model_b,
+        'eco': model_a if run_a.get('co2_emissions_g', 999) < run_b.get('co2_emissions_g', 999) else model_b
+    }
+
+    # Count wins
+    a_wins = sum(1 for w in winners.values() if w == model_a)
+    b_wins = sum(1 for w in winners.values() if w == model_b)
+
+    overall_winner = model_a if a_wins > b_wins else model_b if b_wins > a_wins else "Tie"
+
+    return f"""
+    ### Category Winners
+
+    | Category | Winner | Metric |
+    |----------|--------|--------|
+    | **Accuracy** | **{winners['accuracy']}** | {run_a.get('success_rate', 0):.1f}% vs {run_b.get('success_rate', 0):.1f}% |
+    | **Cost** | **{winners['cost']}** | ${run_a.get('total_cost_usd', 0):.4f} vs ${run_b.get('total_cost_usd', 0):.4f} |
+    | **Speed** | **{winners['speed']}** | {run_a.get('total_duration_ms', 0)/1000:.2f}s vs {run_b.get('total_duration_ms', 0)/1000:.2f}s |
+    | **Eco-Friendly** | **{winners['eco']}** | {run_a.get('co2_emissions_g', 0):.2f}g vs {run_b.get('co2_emissions_g', 0):.2f}g |
+
+    ---
+
+    ### Overall Winner: **{overall_winner}**
+
+    **{model_a}** wins {a_wins} categories
+    **{model_b}** wins {b_wins} categories
+
+    ### Recommendation
+
+    {f"**{model_a}** is the better choice for most use cases" if a_wins > b_wins else
+    f"**{model_b}** is the better choice for most use cases" if b_wins > a_wins else
+    "Both runs are evenly matched - choose based on your specific priorities"}
+    """
+
+
+def create_compare_ui():
+    """
+    Create the compare screen UI components
+
+    Returns:
+        Tuple of (screen_column, component_dict)
+    """
+    components = {}
+
+    with gr.Column(visible=False) as compare_screen:
+        gr.Markdown("# Compare Runs")
+        gr.Markdown("*Side-by-side comparison of two evaluation runs*")
+
+        with gr.Row():
+            components['back_to_leaderboard_btn'] = gr.Button(
+                "Back to Leaderboard",
+                variant="secondary",
+                size="sm"
+            )
+
+        gr.Markdown("## Select Runs to Compare")
+        with gr.Row():
+            with gr.Column():
+                components['compare_run_a_dropdown'] = gr.Dropdown(
+                    label="Run A",
+                    choices=[],
+                    interactive=True
+                )
+            with gr.Column():
+                components['compare_run_b_dropdown'] = gr.Dropdown(
+                    label="Run B",
+                    choices=[],
+                    interactive=True
+                )
+
+        components['compare_button'] = gr.Button(
+            "Compare Selected Runs",
+            variant="primary",
+            size="lg"
+        )
+
+        # Comparison results
+        with gr.Column(visible=False) as comparison_output:
+            gr.Markdown("## Comparison Results")
+
+            with gr.Tabs():
+                with gr.TabItem("Side-by-Side"):
+                    # Side-by-side metrics
+                    with gr.Row():
+                        with gr.Column():
+                            gr.Markdown("### Run A")
+                            components['run_a_card'] = gr.HTML()
+                        with gr.Column():
+                            gr.Markdown("### Run B")
+                            components['run_b_card'] = gr.HTML()
+
+                    # Comparison charts
+                    gr.Markdown("## Metric Comparisons")
+                    components['comparison_charts'] = gr.Plot(
+                        label="Comparison Charts",
+                        show_label=False
+                    )
+
+                    # Winner summary
+                    gr.Markdown("## Winner Summary")
+                    components['winner_summary'] = gr.Markdown()
+
+                with gr.TabItem("Radar Comparison"):
+                    gr.Markdown("""
+                    ### Multi-Dimensional Comparison
+
+                    Compare runs across **6 normalized dimensions**:
+                    - **Success Rate**: Percentage of successful test cases
+                    - **Speed**: Execution time (faster is better)
+                    - **Cost Efficiency**: Dollar cost per test (cheaper is better)
+                    - **Token Efficiency**: Success per 1000 tokens
+                    - **CO2 Efficiency**: Environmental impact (lower is better)
+                    - **GPU Utilization**: Resource usage (if applicable)
+                    """)
+                    components['radar_comparison_chart'] = gr.Plot(
+                        label="Multi-Dimensional Radar Chart",
+                        show_label=False
+                    )
+
+        components['comparison_output'] = comparison_output
+
+    return compare_screen, components
+
+
+def on_compare_runs(run_a_id: str, run_b_id: str, leaderboard_df, components: Dict):
+    """
+    Handle comparison of two runs
+
+    Args:
+        run_a_id: ID of first run
+        run_b_id: ID of second run
+        leaderboard_df: Full leaderboard dataframe
+        components: Dictionary of Gradio components
+
+    Returns:
+        Dictionary of component updates
+    """
+    try:
+        if not run_a_id or not run_b_id:
+            gr.Warning("Please select two runs to compare")
+            return {
+                components['comparison_output']: gr.update(visible=False)
+            }
+
+        if run_a_id == run_b_id:
+            gr.Warning("Please select two different runs")
+            return {
+                components['comparison_output']: gr.update(visible=False)
+            }
+
+        if leaderboard_df is None or leaderboard_df.empty:
+            gr.Warning("Leaderboard data not loaded")
+            return {
+                components['comparison_output']: gr.update(visible=False)
+            }
+
+        # Find the runs in the dataframe
+        run_a = leaderboard_df[leaderboard_df['run_id'] == run_a_id].iloc[0].to_dict()
+        run_b = leaderboard_df[leaderboard_df['run_id'] == run_b_id].iloc[0].to_dict()
+
+        # Create comparison visualizations
+        card_a = create_run_comparison_card(run_a, "A")
+        card_b = create_run_comparison_card(run_b, "B")
+        charts = create_comparison_charts(run_a, run_b)
+        summary = generate_winner_summary(run_a, run_b)
+
+        # Create radar chart for multi-dimensional comparison
+        from components.analytics_charts import create_comparison_radar
+        radar_chart = create_comparison_radar([run_a, run_b])
+
+        return {
+            components['comparison_output']: gr.update(visible=True),
+            components['run_a_card']: gr.update(value=card_a),
+            components['run_b_card']: gr.update(value=card_b),
+            components['comparison_charts']: gr.update(value=charts),
+            components['winner_summary']: gr.update(value=summary),
+            components['radar_comparison_chart']: gr.update(value=radar_chart)
+        }
+
+    except Exception as e:
+        print(f"[ERROR] Comparing runs: {e}")
+        import traceback
+        traceback.print_exc()
+        gr.Warning(f"Error comparing runs: {str(e)}")
+        return {
+            components['comparison_output']: gr.update(visible=False)
+        }
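
The functions in screens/compare.py can also be exercised outside the Gradio app, which is a handy way to sanity-check the winner logic and the comparison figure. A small sketch with two purely hypothetical runs (real rows come from data_loader.load_leaderboard() in app.py); here run A should take Accuracy while run B takes Cost, Speed and Eco-Friendly, making B the overall winner 3 to 1:

```python
from screens.compare import (
    create_run_comparison_card,
    create_comparison_charts,
    generate_winner_summary,
)

# Hypothetical run rows for illustration only.
run_a = {
    "model": "model-a",
    "success_rate": 92.0,          # higher -> wins Accuracy
    "total_cost_usd": 0.0420,
    "total_duration_ms": 95_000,
    "total_tokens": 120_000,
    "co2_emissions_g": 4.1,
}
run_b = {
    "model": "model-b",
    "success_rate": 88.0,
    "total_cost_usd": 0.0110,      # lower -> wins Cost
    "total_duration_ms": 62_000,   # lower -> wins Speed
    "total_tokens": 90_000,
    "co2_emissions_g": 2.3,        # lower -> wins Eco-Friendly
}

print(generate_winner_summary(run_a, run_b))        # markdown naming model-b as overall winner
card_html = create_run_comparison_card(run_a, "A")  # gradient HTML card for run A
fig = create_comparison_charts(run_a, run_b)        # 2x3 grouped bar figure
fig.show()
```

on_compare_runs and the radar tab are not covered by this sketch because they additionally depend on the app's Gradio components and on components.analytics_charts.create_comparison_radar.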