Spaces:
Running
feat: Complete New Evaluation screen with full navigation and MCP integration
Browse files
Complete the New Evaluation screen implementation:
Navigation:
- Added navigate_to_new_evaluation() function
- Updated all navigation handlers to include new_evaluation_screen visibility
- Added new_eval_nav_btn to all navigation outputs
- Wired up all navigation button click events
Event Handlers:
- back_to_leaderboard_from_eval_btn: Navigate back to leaderboard
- eval_estimate_btn: Estimate cost using historical data or MCP fallback
- eval_submit_btn: Submit evaluation with all configuration options
Features:
- Smart cost estimation (historical leaderboard → MCP server fallback)
- Complete 5-section configuration form
- CLI command preview on submission
- Professional success message with job details
Integration:
- Uses the TraceMind MCP Server for cost estimates when no historical data is available
- Displays data source (📊 Historical or 🤖 MCP Estimate)
- All configuration options match the SMOLTRACE evaluation CLI
The New Evaluation screen is now fully functional and integrated!
|
@@ -2697,9 +2697,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2697 |
compare_screen: gr.update(visible=False),
|
| 2698 |
chat_screen: gr.update(visible=False),
|
| 2699 |
synthetic_data_screen: gr.update(visible=False),
|
| 2700 |
-
|
| 2701 |
dashboard_nav_btn: gr.update(variant="primary"),
|
| 2702 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2703 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2704 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2705 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
@@ -2718,8 +2719,30 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2718 |
compare_screen: gr.update(visible=False),
|
| 2719 |
chat_screen: gr.update(visible=False),
|
| 2720 |
synthetic_data_screen: gr.update(visible=False),
|
|
|
|
| 2721 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2722 |
leaderboard_nav_btn: gr.update(variant="primary"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2723 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2724 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2725 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
@@ -2748,8 +2771,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2748 |
compare_screen: gr.update(visible=True),
|
| 2749 |
chat_screen: gr.update(visible=False),
|
| 2750 |
synthetic_data_screen: gr.update(visible=False),
|
|
|
|
| 2751 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2752 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2753 |
compare_nav_btn: gr.update(variant="primary"),
|
| 2754 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2755 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
@@ -2767,8 +2792,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2767 |
compare_screen: gr.update(visible=True),
|
| 2768 |
chat_screen: gr.update(visible=False),
|
| 2769 |
synthetic_data_screen: gr.update(visible=False),
|
|
|
|
| 2770 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2771 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2772 |
compare_nav_btn: gr.update(variant="primary"),
|
| 2773 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2774 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
@@ -2785,8 +2812,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2785 |
compare_screen: gr.update(visible=False),
|
| 2786 |
chat_screen: gr.update(visible=True),
|
| 2787 |
synthetic_data_screen: gr.update(visible=False),
|
|
|
|
| 2788 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2789 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2790 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2791 |
chat_nav_btn: gr.update(variant="primary"),
|
| 2792 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
@@ -2803,8 +2832,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 2803 |
compare_screen: gr.update(visible=False),
|
| 2804 |
chat_screen: gr.update(visible=False),
|
| 2805 |
synthetic_data_screen: gr.update(visible=True),
|
|
|
|
| 2806 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2807 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2808 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2809 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2810 |
synthetic_data_nav_btn: gr.update(variant="primary"),
|
|
@@ -3126,15 +3157,24 @@ Result: {result}
|
|
| 3126 |
fn=navigate_to_dashboard,
|
| 3127 |
outputs=[
|
| 3128 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3129 |
-
|
|
|
|
| 3130 |
] + list(dashboard_components.values())
|
| 3131 |
)
|
| 3132 |
|
| 3133 |
leaderboard_nav_btn.click(
|
| 3134 |
fn=navigate_to_leaderboard,
|
| 3135 |
outputs=[
|
| 3136 |
-
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3137 |
-
dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3138 |
]
|
| 3139 |
)
|
| 3140 |
|
|
@@ -3142,7 +3182,8 @@ Result: {result}
|
|
| 3142 |
fn=navigate_to_compare,
|
| 3143 |
outputs=[
|
| 3144 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3145 |
-
|
|
|
|
| 3146 |
compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
|
| 3147 |
]
|
| 3148 |
)
|
|
@@ -3151,14 +3192,16 @@ Result: {result}
|
|
| 3151 |
fn=navigate_to_chat,
|
| 3152 |
outputs=[
|
| 3153 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3154 |
-
|
|
|
|
| 3155 |
]
|
| 3156 |
)
|
| 3157 |
synthetic_data_nav_btn.click(
|
| 3158 |
fn=navigate_to_synthetic_data,
|
| 3159 |
outputs=[
|
| 3160 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3161 |
-
|
|
|
|
| 3162 |
]
|
| 3163 |
)
|
| 3164 |
|
|
@@ -3175,6 +3218,38 @@ Result: {result}
|
|
| 3175 |
outputs=[push_status]
|
| 3176 |
)
|
| 3177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3178 |
# Chat screen event handlers (with streaming)
|
| 3179 |
chat_components['send_btn'].click(
|
| 3180 |
fn=on_send_message,
|
|
|
|
| 2697 |
compare_screen: gr.update(visible=False),
|
| 2698 |
chat_screen: gr.update(visible=False),
|
| 2699 |
synthetic_data_screen: gr.update(visible=False),
|
| 2700 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2701 |
dashboard_nav_btn: gr.update(variant="primary"),
|
| 2702 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2703 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2704 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2705 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2706 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2719 |
compare_screen: gr.update(visible=False),
|
| 2720 |
chat_screen: gr.update(visible=False),
|
| 2721 |
synthetic_data_screen: gr.update(visible=False),
|
| 2722 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2723 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2724 |
leaderboard_nav_btn: gr.update(variant="primary"),
|
| 2725 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2726 |
+
compare_nav_btn: gr.update(variant="secondary"),
|
| 2727 |
+
chat_nav_btn: gr.update(variant="secondary"),
|
| 2728 |
+
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
| 2729 |
+
docs_nav_btn: gr.update(variant="secondary"),
|
| 2730 |
+
}
|
| 2731 |
+
|
| 2732 |
+
def navigate_to_new_evaluation():
|
| 2733 |
+
"""Navigate to new evaluation screen"""
|
| 2734 |
+
return {
|
| 2735 |
+
dashboard_screen: gr.update(visible=False),
|
| 2736 |
+
leaderboard_screen: gr.update(visible=False),
|
| 2737 |
+
run_detail_screen: gr.update(visible=False),
|
| 2738 |
+
trace_detail_screen: gr.update(visible=False),
|
| 2739 |
+
compare_screen: gr.update(visible=False),
|
| 2740 |
+
chat_screen: gr.update(visible=False),
|
| 2741 |
+
synthetic_data_screen: gr.update(visible=False),
|
| 2742 |
+
new_evaluation_screen: gr.update(visible=True),
|
| 2743 |
+
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2744 |
+
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2745 |
+
new_eval_nav_btn: gr.update(variant="primary"),
|
| 2746 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2747 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2748 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2771 |
compare_screen: gr.update(visible=True),
|
| 2772 |
chat_screen: gr.update(visible=False),
|
| 2773 |
synthetic_data_screen: gr.update(visible=False),
|
| 2774 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2775 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2776 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2777 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2778 |
compare_nav_btn: gr.update(variant="primary"),
|
| 2779 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2780 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2792 |
compare_screen: gr.update(visible=True),
|
| 2793 |
chat_screen: gr.update(visible=False),
|
| 2794 |
synthetic_data_screen: gr.update(visible=False),
|
| 2795 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2796 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2797 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2798 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2799 |
compare_nav_btn: gr.update(variant="primary"),
|
| 2800 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2801 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2812 |
compare_screen: gr.update(visible=False),
|
| 2813 |
chat_screen: gr.update(visible=True),
|
| 2814 |
synthetic_data_screen: gr.update(visible=False),
|
| 2815 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2816 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2817 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2818 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2819 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2820 |
chat_nav_btn: gr.update(variant="primary"),
|
| 2821 |
synthetic_data_nav_btn: gr.update(variant="secondary"),
|
|
|
|
| 2832 |
compare_screen: gr.update(visible=False),
|
| 2833 |
chat_screen: gr.update(visible=False),
|
| 2834 |
synthetic_data_screen: gr.update(visible=True),
|
| 2835 |
+
new_evaluation_screen: gr.update(visible=False),
|
| 2836 |
dashboard_nav_btn: gr.update(variant="secondary"),
|
| 2837 |
leaderboard_nav_btn: gr.update(variant="secondary"),
|
| 2838 |
+
new_eval_nav_btn: gr.update(variant="secondary"),
|
| 2839 |
compare_nav_btn: gr.update(variant="secondary"),
|
| 2840 |
chat_nav_btn: gr.update(variant="secondary"),
|
| 2841 |
synthetic_data_nav_btn: gr.update(variant="primary"),
|
|
|
|
| 3157 |
fn=navigate_to_dashboard,
|
| 3158 |
outputs=[
|
| 3159 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3160 |
+
new_evaluation_screen,
|
| 3161 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3162 |
] + list(dashboard_components.values())
|
| 3163 |
)
|
| 3164 |
|
| 3165 |
leaderboard_nav_btn.click(
|
| 3166 |
fn=navigate_to_leaderboard,
|
| 3167 |
outputs=[
|
| 3168 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
|
| 3169 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3170 |
+
]
|
| 3171 |
+
)
|
| 3172 |
+
|
| 3173 |
+
new_eval_nav_btn.click(
|
| 3174 |
+
fn=navigate_to_new_evaluation,
|
| 3175 |
+
outputs=[
|
| 3176 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
|
| 3177 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3178 |
]
|
| 3179 |
)
|
| 3180 |
|
|
|
|
| 3182 |
fn=navigate_to_compare,
|
| 3183 |
outputs=[
|
| 3184 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3185 |
+
new_evaluation_screen,
|
| 3186 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn,
|
| 3187 |
compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
|
| 3188 |
]
|
| 3189 |
)
|
|
|
|
| 3192 |
fn=navigate_to_chat,
|
| 3193 |
outputs=[
|
| 3194 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3195 |
+
new_evaluation_screen,
|
| 3196 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3197 |
]
|
| 3198 |
)
|
| 3199 |
synthetic_data_nav_btn.click(
|
| 3200 |
fn=navigate_to_synthetic_data,
|
| 3201 |
outputs=[
|
| 3202 |
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
|
| 3203 |
+
new_evaluation_screen,
|
| 3204 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3205 |
]
|
| 3206 |
)
|
| 3207 |
|
|
|
|
| 3218 |
outputs=[push_status]
|
| 3219 |
)
|
| 3220 |
|
| 3221 |
+
# New Evaluation screen event handlers
|
| 3222 |
+
back_to_leaderboard_from_eval_btn.click(
|
| 3223 |
+
fn=navigate_to_leaderboard,
|
| 3224 |
+
outputs=[
|
| 3225 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
|
| 3226 |
+
dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
|
| 3227 |
+
]
|
| 3228 |
+
)
|
| 3229 |
+
|
| 3230 |
+
eval_estimate_btn.click(
|
| 3231 |
+
fn=on_hardware_change,
|
| 3232 |
+
inputs=[eval_model, eval_hardware],
|
| 3233 |
+
outputs=[eval_cost_estimate]
|
| 3234 |
+
)
|
| 3235 |
+
|
| 3236 |
+
eval_submit_btn.click(
|
| 3237 |
+
fn=on_submit_evaluation_comprehensive,
|
| 3238 |
+
inputs=[
|
| 3239 |
+
# Infrastructure
|
| 3240 |
+
eval_infra_provider, eval_hardware,
|
| 3241 |
+
# Model Configuration
|
| 3242 |
+
eval_model, eval_provider, eval_hf_inference_provider, eval_hf_token,
|
| 3243 |
+
# Agent Configuration
|
| 3244 |
+
eval_agent_type, eval_search_provider, eval_enable_tools, eval_prompt_yml, eval_mcp_server_url, eval_additional_imports,
|
| 3245 |
+
# Test Configuration
|
| 3246 |
+
eval_dataset_name, eval_split, eval_difficulty, eval_parallel_workers, eval_working_directory,
|
| 3247 |
+
# Output & Monitoring
|
| 3248 |
+
eval_output_format, eval_output_dir, eval_enable_otel, eval_enable_gpu_metrics, eval_private, eval_debug, eval_quiet, eval_run_id
|
| 3249 |
+
],
|
| 3250 |
+
outputs=[eval_success_message]
|
| 3251 |
+
)
|
| 3252 |
+
|
| 3253 |
# Chat screen event handlers (with streaming)
|
| 3254 |
chat_components['send_btn'].click(
|
| 3255 |
fn=on_send_message,
|