kshitijthakkar committed on
Commit
14c0bae
·
1 Parent(s): 7addd50

feat: Complete New Evaluation screen with full navigation and MCP integration

Browse files

Complete the New Evaluation screen implementation:

Navigation:
- Added navigate_to_new_evaluation() function
- Updated all navigation handlers to include new_evaluation_screen visibility
- Added new_eval_nav_btn to all navigation outputs
- Wired up all navigation button click events

Event Handlers:
- back_to_leaderboard_from_eval_btn: Navigate back to leaderboard
- eval_estimate_btn: Estimate cost using historical data or MCP fallback
- eval_submit_btn: Submit evaluation with all configuration options

Features:
- Smart cost estimation (historical leaderboard → MCP server fallback)
- Complete 5-section configuration form
- CLI command preview on submission
- Professional success message with job details

Integration:
- Uses TraceMind MCP Server for cost estimates when no historical data is available
- Displays data source (📊 Historical or 🤖 MCP Estimate)
- All configurations match SMOLTRACE evaluation CLI

The New Evaluation screen is now fully functional and integrated!

Files changed (1) hide show
  1. app.py +82 -7
app.py CHANGED
@@ -2697,9 +2697,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2697
  compare_screen: gr.update(visible=False),
2698
  chat_screen: gr.update(visible=False),
2699
  synthetic_data_screen: gr.update(visible=False),
2700
- synthetic_data_screen: gr.update(visible=False),
2701
  dashboard_nav_btn: gr.update(variant="primary"),
2702
  leaderboard_nav_btn: gr.update(variant="secondary"),
 
2703
  compare_nav_btn: gr.update(variant="secondary"),
2704
  chat_nav_btn: gr.update(variant="secondary"),
2705
  synthetic_data_nav_btn: gr.update(variant="secondary"),
@@ -2718,8 +2719,30 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2718
  compare_screen: gr.update(visible=False),
2719
  chat_screen: gr.update(visible=False),
2720
  synthetic_data_screen: gr.update(visible=False),
 
2721
  dashboard_nav_btn: gr.update(variant="secondary"),
2722
  leaderboard_nav_btn: gr.update(variant="primary"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2723
  compare_nav_btn: gr.update(variant="secondary"),
2724
  chat_nav_btn: gr.update(variant="secondary"),
2725
  synthetic_data_nav_btn: gr.update(variant="secondary"),
@@ -2748,8 +2771,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2748
  compare_screen: gr.update(visible=True),
2749
  chat_screen: gr.update(visible=False),
2750
  synthetic_data_screen: gr.update(visible=False),
 
2751
  dashboard_nav_btn: gr.update(variant="secondary"),
2752
  leaderboard_nav_btn: gr.update(variant="secondary"),
 
2753
  compare_nav_btn: gr.update(variant="primary"),
2754
  chat_nav_btn: gr.update(variant="secondary"),
2755
  synthetic_data_nav_btn: gr.update(variant="secondary"),
@@ -2767,8 +2792,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2767
  compare_screen: gr.update(visible=True),
2768
  chat_screen: gr.update(visible=False),
2769
  synthetic_data_screen: gr.update(visible=False),
 
2770
  dashboard_nav_btn: gr.update(variant="secondary"),
2771
  leaderboard_nav_btn: gr.update(variant="secondary"),
 
2772
  compare_nav_btn: gr.update(variant="primary"),
2773
  chat_nav_btn: gr.update(variant="secondary"),
2774
  synthetic_data_nav_btn: gr.update(variant="secondary"),
@@ -2785,8 +2812,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2785
  compare_screen: gr.update(visible=False),
2786
  chat_screen: gr.update(visible=True),
2787
  synthetic_data_screen: gr.update(visible=False),
 
2788
  dashboard_nav_btn: gr.update(variant="secondary"),
2789
  leaderboard_nav_btn: gr.update(variant="secondary"),
 
2790
  compare_nav_btn: gr.update(variant="secondary"),
2791
  chat_nav_btn: gr.update(variant="primary"),
2792
  synthetic_data_nav_btn: gr.update(variant="secondary"),
@@ -2803,8 +2832,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2803
  compare_screen: gr.update(visible=False),
2804
  chat_screen: gr.update(visible=False),
2805
  synthetic_data_screen: gr.update(visible=True),
 
2806
  dashboard_nav_btn: gr.update(variant="secondary"),
2807
  leaderboard_nav_btn: gr.update(variant="secondary"),
 
2808
  compare_nav_btn: gr.update(variant="secondary"),
2809
  chat_nav_btn: gr.update(variant="secondary"),
2810
  synthetic_data_nav_btn: gr.update(variant="primary"),
@@ -3126,15 +3157,24 @@ Result: {result}
3126
  fn=navigate_to_dashboard,
3127
  outputs=[
3128
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3129
- dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
 
3130
  ] + list(dashboard_components.values())
3131
  )
3132
 
3133
  leaderboard_nav_btn.click(
3134
  fn=navigate_to_leaderboard,
3135
  outputs=[
3136
- dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3137
- dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
 
 
 
 
 
 
 
 
3138
  ]
3139
  )
3140
 
@@ -3142,7 +3182,8 @@ Result: {result}
3142
  fn=navigate_to_compare,
3143
  outputs=[
3144
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3145
- dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn,
 
3146
  compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
3147
  ]
3148
  )
@@ -3151,14 +3192,16 @@ Result: {result}
3151
  fn=navigate_to_chat,
3152
  outputs=[
3153
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3154
- dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
 
3155
  ]
3156
  )
3157
  synthetic_data_nav_btn.click(
3158
  fn=navigate_to_synthetic_data,
3159
  outputs=[
3160
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3161
- dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
 
3162
  ]
3163
  )
3164
 
@@ -3175,6 +3218,38 @@ Result: {result}
3175
  outputs=[push_status]
3176
  )
3177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3178
  # Chat screen event handlers (with streaming)
3179
  chat_components['send_btn'].click(
3180
  fn=on_send_message,
 
2697
  compare_screen: gr.update(visible=False),
2698
  chat_screen: gr.update(visible=False),
2699
  synthetic_data_screen: gr.update(visible=False),
2700
+ new_evaluation_screen: gr.update(visible=False),
2701
  dashboard_nav_btn: gr.update(variant="primary"),
2702
  leaderboard_nav_btn: gr.update(variant="secondary"),
2703
+ new_eval_nav_btn: gr.update(variant="secondary"),
2704
  compare_nav_btn: gr.update(variant="secondary"),
2705
  chat_nav_btn: gr.update(variant="secondary"),
2706
  synthetic_data_nav_btn: gr.update(variant="secondary"),
 
2719
  compare_screen: gr.update(visible=False),
2720
  chat_screen: gr.update(visible=False),
2721
  synthetic_data_screen: gr.update(visible=False),
2722
+ new_evaluation_screen: gr.update(visible=False),
2723
  dashboard_nav_btn: gr.update(variant="secondary"),
2724
  leaderboard_nav_btn: gr.update(variant="primary"),
2725
+ new_eval_nav_btn: gr.update(variant="secondary"),
2726
+ compare_nav_btn: gr.update(variant="secondary"),
2727
+ chat_nav_btn: gr.update(variant="secondary"),
2728
+ synthetic_data_nav_btn: gr.update(variant="secondary"),
2729
+ docs_nav_btn: gr.update(variant="secondary"),
2730
+ }
2731
+
2732
+ def navigate_to_new_evaluation():
2733
+ """Navigate to new evaluation screen"""
2734
+ return {
2735
+ dashboard_screen: gr.update(visible=False),
2736
+ leaderboard_screen: gr.update(visible=False),
2737
+ run_detail_screen: gr.update(visible=False),
2738
+ trace_detail_screen: gr.update(visible=False),
2739
+ compare_screen: gr.update(visible=False),
2740
+ chat_screen: gr.update(visible=False),
2741
+ synthetic_data_screen: gr.update(visible=False),
2742
+ new_evaluation_screen: gr.update(visible=True),
2743
+ dashboard_nav_btn: gr.update(variant="secondary"),
2744
+ leaderboard_nav_btn: gr.update(variant="secondary"),
2745
+ new_eval_nav_btn: gr.update(variant="primary"),
2746
  compare_nav_btn: gr.update(variant="secondary"),
2747
  chat_nav_btn: gr.update(variant="secondary"),
2748
  synthetic_data_nav_btn: gr.update(variant="secondary"),
 
2771
  compare_screen: gr.update(visible=True),
2772
  chat_screen: gr.update(visible=False),
2773
  synthetic_data_screen: gr.update(visible=False),
2774
+ new_evaluation_screen: gr.update(visible=False),
2775
  dashboard_nav_btn: gr.update(variant="secondary"),
2776
  leaderboard_nav_btn: gr.update(variant="secondary"),
2777
+ new_eval_nav_btn: gr.update(variant="secondary"),
2778
  compare_nav_btn: gr.update(variant="primary"),
2779
  chat_nav_btn: gr.update(variant="secondary"),
2780
  synthetic_data_nav_btn: gr.update(variant="secondary"),
 
2792
  compare_screen: gr.update(visible=True),
2793
  chat_screen: gr.update(visible=False),
2794
  synthetic_data_screen: gr.update(visible=False),
2795
+ new_evaluation_screen: gr.update(visible=False),
2796
  dashboard_nav_btn: gr.update(variant="secondary"),
2797
  leaderboard_nav_btn: gr.update(variant="secondary"),
2798
+ new_eval_nav_btn: gr.update(variant="secondary"),
2799
  compare_nav_btn: gr.update(variant="primary"),
2800
  chat_nav_btn: gr.update(variant="secondary"),
2801
  synthetic_data_nav_btn: gr.update(variant="secondary"),
 
2812
  compare_screen: gr.update(visible=False),
2813
  chat_screen: gr.update(visible=True),
2814
  synthetic_data_screen: gr.update(visible=False),
2815
+ new_evaluation_screen: gr.update(visible=False),
2816
  dashboard_nav_btn: gr.update(variant="secondary"),
2817
  leaderboard_nav_btn: gr.update(variant="secondary"),
2818
+ new_eval_nav_btn: gr.update(variant="secondary"),
2819
  compare_nav_btn: gr.update(variant="secondary"),
2820
  chat_nav_btn: gr.update(variant="primary"),
2821
  synthetic_data_nav_btn: gr.update(variant="secondary"),
 
2832
  compare_screen: gr.update(visible=False),
2833
  chat_screen: gr.update(visible=False),
2834
  synthetic_data_screen: gr.update(visible=True),
2835
+ new_evaluation_screen: gr.update(visible=False),
2836
  dashboard_nav_btn: gr.update(variant="secondary"),
2837
  leaderboard_nav_btn: gr.update(variant="secondary"),
2838
+ new_eval_nav_btn: gr.update(variant="secondary"),
2839
  compare_nav_btn: gr.update(variant="secondary"),
2840
  chat_nav_btn: gr.update(variant="secondary"),
2841
  synthetic_data_nav_btn: gr.update(variant="primary"),
 
3157
  fn=navigate_to_dashboard,
3158
  outputs=[
3159
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3160
+ new_evaluation_screen,
3161
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3162
  ] + list(dashboard_components.values())
3163
  )
3164
 
3165
  leaderboard_nav_btn.click(
3166
  fn=navigate_to_leaderboard,
3167
  outputs=[
3168
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
3169
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3170
+ ]
3171
+ )
3172
+
3173
+ new_eval_nav_btn.click(
3174
+ fn=navigate_to_new_evaluation,
3175
+ outputs=[
3176
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
3177
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3178
  ]
3179
  )
3180
 
 
3182
  fn=navigate_to_compare,
3183
  outputs=[
3184
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3185
+ new_evaluation_screen,
3186
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn,
3187
  compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
3188
  ]
3189
  )
 
3192
  fn=navigate_to_chat,
3193
  outputs=[
3194
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3195
+ new_evaluation_screen,
3196
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3197
  ]
3198
  )
3199
  synthetic_data_nav_btn.click(
3200
  fn=navigate_to_synthetic_data,
3201
  outputs=[
3202
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3203
+ new_evaluation_screen,
3204
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3205
  ]
3206
  )
3207
 
 
3218
  outputs=[push_status]
3219
  )
3220
 
3221
+ # New Evaluation screen event handlers
3222
+ back_to_leaderboard_from_eval_btn.click(
3223
+ fn=navigate_to_leaderboard,
3224
+ outputs=[
3225
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
3226
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
3227
+ ]
3228
+ )
3229
+
3230
+ eval_estimate_btn.click(
3231
+ fn=on_hardware_change,
3232
+ inputs=[eval_model, eval_hardware],
3233
+ outputs=[eval_cost_estimate]
3234
+ )
3235
+
3236
+ eval_submit_btn.click(
3237
+ fn=on_submit_evaluation_comprehensive,
3238
+ inputs=[
3239
+ # Infrastructure
3240
+ eval_infra_provider, eval_hardware,
3241
+ # Model Configuration
3242
+ eval_model, eval_provider, eval_hf_inference_provider, eval_hf_token,
3243
+ # Agent Configuration
3244
+ eval_agent_type, eval_search_provider, eval_enable_tools, eval_prompt_yml, eval_mcp_server_url, eval_additional_imports,
3245
+ # Test Configuration
3246
+ eval_dataset_name, eval_split, eval_difficulty, eval_parallel_workers, eval_working_directory,
3247
+ # Output & Monitoring
3248
+ eval_output_format, eval_output_dir, eval_enable_otel, eval_enable_gpu_metrics, eval_private, eval_debug, eval_quiet, eval_run_id
3249
+ ],
3250
+ outputs=[eval_success_message]
3251
+ )
3252
+
3253
  # Chat screen event handlers (with streaming)
3254
  chat_components['send_btn'].click(
3255
  fn=on_send_message,