kshitijthakkar committed on
Commit abb32f1 · Parent: 698e93f

feat: Add comprehensive documentation screen with 4-tab ecosystem guide

Add a new Documentation screen, accessible from the sidebar navigation, with complete documentation for all TraceMind ecosystem components:

- About tab: Ecosystem overview, architecture diagrams, and quick start guide
- TraceVerde tab: genai_otel_instrument library documentation with installation, usage examples, and troubleshooting
- SmolTrace tab: Evaluation engine guide with CLI usage, dataset schemas, and best practices
- TraceMind-MCP-Server tab: MCP server implementation details with tool specifications and integration examples

Changes:
- Created screens/documentation.py with 1600+ lines of comprehensive docs
- Updated app.py to integrate documentation screen into navigation flow
- Added navigate_to_documentation() handler and wired up docs_nav_btn
- Updated all navigation functions to control documentation_screen visibility
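
For orientation, the new handler follows the existing show-one-screen pattern; a condensed sketch (component names as in the diff below, abbreviated here):

```python
def navigate_to_documentation():
    """Navigate to documentation screen"""
    return {
        documentation_screen: gr.update(visible=True),
        dashboard_screen: gr.update(visible=False),
        # ... every other screen hidden the same way ...
        docs_nav_btn: gr.update(variant="primary"),
        dashboard_nav_btn: gr.update(variant="secondary"),
        # ... every other nav button set to secondary ...
    }
```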

Files changed (2)
  1. app.py +49 -6
  2. screens/documentation.py +1606 -0
app.py CHANGED
@@ -59,6 +59,7 @@ from screens.chat import (
     on_clear_chat,
     on_quick_action
 )
+from screens.documentation import create_documentation_screen
 from screens.mcp_helpers import (
     call_analyze_leaderboard_sync,
     call_debug_trace_sync,
@@ -2413,6 +2414,11 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
 
     eval_success_message = gr.HTML(visible=False)
 
+    # ============================================================================
+    # Screen 9: Documentation
+    # ============================================================================
+    documentation_screen = create_documentation_screen()
+
     # ============================================================================
     # Evaluation Helper Functions
     # ============================================================================
@@ -2783,6 +2789,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="primary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2805,6 +2812,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="primary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2825,6 +2833,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=True),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="primary"),
@@ -2857,6 +2866,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2878,6 +2888,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2898,6 +2909,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=True),
             synthetic_data_screen: gr.update(visible=False),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2918,6 +2930,7 @@ No historical data available for **{model}**.
             chat_screen: gr.update(visible=False),
             synthetic_data_screen: gr.update(visible=True),
             new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=False),
             dashboard_nav_btn: gr.update(variant="secondary"),
             leaderboard_nav_btn: gr.update(variant="secondary"),
             new_eval_nav_btn: gr.update(variant="secondary"),
@@ -2927,6 +2940,27 @@ No historical data available for **{model}**.
             docs_nav_btn: gr.update(variant="secondary"),
         }
 
+    def navigate_to_documentation():
+        """Navigate to documentation screen"""
+        return {
+            dashboard_screen: gr.update(visible=False),
+            leaderboard_screen: gr.update(visible=False),
+            run_detail_screen: gr.update(visible=False),
+            trace_detail_screen: gr.update(visible=False),
+            compare_screen: gr.update(visible=False),
+            chat_screen: gr.update(visible=False),
+            synthetic_data_screen: gr.update(visible=False),
+            new_evaluation_screen: gr.update(visible=False),
+            documentation_screen: gr.update(visible=True),
+            dashboard_nav_btn: gr.update(variant="secondary"),
+            leaderboard_nav_btn: gr.update(variant="secondary"),
+            new_eval_nav_btn: gr.update(variant="secondary"),
+            compare_nav_btn: gr.update(variant="secondary"),
+            chat_nav_btn: gr.update(variant="secondary"),
+            synthetic_data_nav_btn: gr.update(variant="secondary"),
+            docs_nav_btn: gr.update(variant="primary"),
+        }
+
     # Synthetic Data Generator Callbacks
     def on_generate_synthetic_data(domain, tools, num_tasks, difficulty, agent_type):
         """Generate synthetic dataset using MCP server"""
@@ -3243,7 +3277,7 @@ Result: {result}
         fn=navigate_to_dashboard,
         outputs=[
             dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
-            new_evaluation_screen,
+            new_evaluation_screen, documentation_screen,
             dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
         ] + list(dashboard_components.values())
     )
@@ -3251,7 +3285,7 @@ Result: {result}
     leaderboard_nav_btn.click(
         fn=navigate_to_leaderboard,
         outputs=[
-            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen,
             dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
         ]
     )
@@ -3259,7 +3293,7 @@ Result: {result}
     new_eval_nav_btn.click(
         fn=navigate_to_new_evaluation,
         outputs=[
-            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen,
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen,
            dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
         ]
     )
@@ -3268,7 +3302,7 @@ Result: {result}
         fn=navigate_to_compare,
         outputs=[
             dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
-            new_evaluation_screen,
+            new_evaluation_screen, documentation_screen,
             dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn,
             compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
         ]
@@ -3278,7 +3312,7 @@ Result: {result}
         fn=navigate_to_chat,
         outputs=[
             dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
-            new_evaluation_screen,
+            new_evaluation_screen, documentation_screen,
             dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
         ]
     )
@@ -3286,7 +3320,16 @@ Result: {result}
         fn=navigate_to_synthetic_data,
         outputs=[
             dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
-            new_evaluation_screen,
+            new_evaluation_screen, documentation_screen,
+            dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
+        ]
+    )
+
+    docs_nav_btn.click(
+        fn=navigate_to_documentation,
+        outputs=[
+            dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
+            new_evaluation_screen, documentation_screen,
             dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn
         ]
     )
screens/documentation.py ADDED
@@ -0,0 +1,1606 @@
"""
Documentation Screen for TraceMind-AI
Comprehensive documentation for the TraceMind ecosystem
"""

import gradio as gr


def create_about_tab():
    """Create the About tab with ecosystem overview"""
    return gr.Markdown("""
# 🧠 TraceMind Ecosystem

**The Complete AI Agent Evaluation Platform**

TraceMind is a comprehensive ecosystem for evaluating, monitoring, and optimizing AI agents. Built on open-source foundations and powered by the Model Context Protocol (MCP), TraceMind provides everything you need for production-grade agent evaluation.

---

## 🏗️ Architecture Overview

The TraceMind ecosystem consists of four integrated components:

```
┌──────────────────────────────────────────────────────────────┐
│                     TraceMind Ecosystem                      │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  1️⃣  TraceVerde (genai_otel_instrument)                      │
│      └─ Automatic OpenTelemetry Instrumentation              │
│      └─ Zero-code tracing for LLM frameworks                 │
│                                                              │
│  2️⃣  SMOLTRACE                                               │
│      └─ Lightweight Agent Evaluation Engine                  │
│      └─ Generates structured datasets                        │
│                                                              │
│  3️⃣  TraceMind-MCP-Server                                    │
│      └─ MCP Server (Track 1: Building MCP)                   │
│      └─ Provides intelligent analysis tools                  │
│                                                              │
│  4️⃣  TraceMind-AI (This App!)                                │
│      └─ Gradio UI (Track 2: MCP in Action)                   │
│      └─ Visualizes data + consumes MCP tools                 │
│                                                              │
└──────────────────────────────────────────────────────────────┘
```

---

## 🔄 The Complete Flow

### 1. **Instrument Your Agents** (TraceVerde)
```python
from genai_otel_instrument import instrument_llm

# Zero-code instrumentation
instrument_llm(enable_content_capture=True)

# Your agent code runs normally, but now traced!
agent.run("What's the weather in Tokyo?")
```

### 2. **Evaluate with SMOLTRACE**
```bash
# Run comprehensive evaluation
smoltrace-eval \\
    --model openai/gpt-4 \\
    --agent-type both \\
    --enable-otel
```

### 3. **Analyze Results** (This UI)
- View leaderboard rankings
- Compare model performance
- Explore detailed traces
- Ask questions with MCP-powered chat

---

## 🎯 Key Features

### For Developers
- ✅ **Zero-code Instrumentation**: Just import and go
- ✅ **Framework Agnostic**: Works with LiteLLM, Transformers, LangChain, CrewAI, etc.
- ✅ **Production Ready**: Lightweight, minimal overhead
- ✅ **Standards Compliant**: Uses OpenTelemetry conventions

### For Researchers
- ✅ **Comprehensive Metrics**: Token usage, costs, latency, GPU utilization
- ✅ **Reproducible Results**: Structured datasets on HuggingFace
- ✅ **Model Comparison**: Side-by-side analysis
- ✅ **Trace Visualization**: Step-by-step agent execution

### For Organizations
- ✅ **Cost Transparency**: Real-time cost tracking and estimation
- ✅ **Sustainability**: CO2 emissions monitoring (TraceVerde)
- ✅ **MCP Integration**: Connect to intelligent analysis tools
- ✅ **HuggingFace Native**: Seamless dataset integration

---

## 🏆 Built for MCP's 1st Birthday Hackathon

TraceMind demonstrates the complete MCP ecosystem:

**Track 1 (Building MCP)**: [TraceMind-mcp-server](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server)
- Provides MCP tools for leaderboard analysis, cost estimation, trace debugging

**Track 2 (MCP in Action)**: TraceMind-AI (this app!)
- Consumes MCP servers for autonomous agent chat and intelligent insights

---

## 🔗 Quick Links

| Component | Description | Links |
|-----------|-------------|-------|
| **TraceVerde** | OTEL Instrumentation | [GitHub](https://github.com/Mandark-droid/genai_otel_instrument) • [PyPI](https://pypi.org/project/genai-otel-instrument) |
| **SMOLTRACE** | Evaluation Engine | [GitHub](https://github.com/Mandark-droid/SMOLTRACE) • [PyPI](https://pypi.org/project/smoltrace/) |
| **MCP Server** | Building MCP (Track 1) | [HF Space](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server) |
| **TraceMind-AI** | MCP in Action (Track 2) | [HF Space](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind) |

---

## 📚 Documentation Navigation

Use the tabs above to explore detailed documentation for each component:

- **About**: This overview (you are here)
- **TraceVerde**: OpenTelemetry instrumentation for LLMs
- **SmolTrace**: Agent evaluation engine
- **TraceMind-MCP-Server**: MCP server implementation details

---

## 💡 Getting Started

### Quick Start (5 minutes)
```bash
# 1. Install TraceVerde for instrumentation
pip install genai-otel-instrument

# 2. Install SMOLTRACE for evaluation
pip install smoltrace

# 3. Run your first evaluation
smoltrace-eval --model openai/gpt-4 --agent-type tool

# 4. View results in TraceMind-AI (this UI!)
```

### Learn More
- Read component-specific docs in the tabs above
- Try the **Agent Chat** for interactive queries
- Explore the **Leaderboard** to see real evaluation data
- Check the **Trace Detail** screen for deep inspection

---

## 🤝 Contributing

All components are open source under AGPL-3.0:
- Report issues on GitHub
- Submit pull requests
- Share your evaluation results
- Join the community discussions

---

## 👏 Acknowledgments

Built with ❤️ for **MCP's 1st Birthday Hackathon** by **Kshitij Thakkar**

Special thanks to:
- **Anthropic** - For the Model Context Protocol
- **Gradio Team** - For Gradio 6 with MCP integration
- **HuggingFace** - For Spaces and dataset infrastructure
- **Google** - For Gemini API access
- **OpenTelemetry** - For standardized observability

---

*Last Updated: November 2025*
""")


def create_traceverde_tab():
    """Create the TraceVerde documentation tab"""
    return gr.Markdown("""
# 🔭 TraceVerde (genai_otel_instrument)

**Automatic OpenTelemetry Instrumentation for LLM Applications**

[![GitHub](https://img.shields.io/badge/GitHub-genai__otel__instrument-black?logo=github)](https://github.com/Mandark-droid/genai_otel_instrument)
[![PyPI](https://img.shields.io/badge/PyPI-genai--otel--instrument-blue?logo=pypi)](https://pypi.org/project/genai-otel-instrument)

---

## What is TraceVerde?

TraceVerde is a **zero-code** OpenTelemetry instrumentation library for GenAI applications. It automatically captures:

- 🔹 Every LLM call (token usage, cost, latency)
- 🔹 Tool executions and results
- 🔹 Agent reasoning steps
- 🔹 GPU metrics (utilization, memory, temperature)
- 🔹 CO2 emissions (via CodeCarbon integration)

All with **one import statement** - no code changes required!

---

## 📦 Installation

```bash
pip install genai-otel-instrument

# With GPU metrics support
pip install genai-otel-instrument[gpu]

# With CO2 emissions tracking
pip install genai-otel-instrument[carbon]

# All features
pip install genai-otel-instrument[all]
```

---

## 🚀 Quick Start

### Basic Usage

```python
from genai_otel_instrument import instrument_llm
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

# 1. Setup OpenTelemetry (one-time setup)
trace.set_tracer_provider(TracerProvider())
span_processor = SimpleSpanProcessor(ConsoleSpanExporter())
trace.get_tracer_provider().add_span_processor(span_processor)

# 2. Instrument all LLM frameworks (one line!)
instrument_llm(enable_content_capture=True)

# 3. Use your LLM framework normally - it's now traced!
from litellm import completion

response = completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello!"}]
)

# Traces are automatically captured and exported!
```

---

## 🎯 Supported Frameworks

TraceVerde automatically instruments:

| Framework | Status | Import Required |
|-----------|--------|-----------------|
| **LiteLLM** | ✅ Full Support | `from litellm import completion` |
| **Transformers** | ✅ Full Support | `from transformers import pipeline` |
| **LangChain** | ✅ Full Support | `from langchain import ...` |
| **CrewAI** | ✅ Full Support | `from crewai import Agent` |
| **smolagents** | ✅ Full Support | `from smolagents import ...` |
| **OpenAI SDK** | ✅ Full Support | `from openai import OpenAI` |

**No code changes needed** - just import and use as normal!
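
As a quick illustration of the framework-agnostic claim, here is a minimal sketch that traces the OpenAI SDK instead of LiteLLM. It assumes the same tracer-provider setup as in Quick Start and an `OPENAI_API_KEY` in the environment:

```python
from genai_otel_instrument import instrument_llm
from openai import OpenAI

# The same single instrumentation call as before - nothing OpenAI-specific
instrument_llm(enable_content_capture=True)

# Standard OpenAI SDK usage; spans are emitted automatically
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```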

---

## 📊 What Gets Captured?

### LLM Spans

Every LLM call creates a span with:

```json
{
  "span_name": "LLM Call - Reasoning",
  "attributes": {
    "gen_ai.system": "openai",
    "gen_ai.request.model": "gpt-4",
    "gen_ai.operation.name": "chat",
    "gen_ai.usage.prompt_tokens": 78,
    "gen_ai.usage.completion_tokens": 156,
    "gen_ai.usage.total_tokens": 234,
    "gen_ai.usage.cost.total": 0.0012,
    "gen_ai.response.finish_reasons": ["stop"],
    "gen_ai.request.temperature": 0.7
  }
}
```

### Tool Spans

Tool executions are traced with:

```json
{
  "span_name": "Tool Call - get_weather",
  "attributes": {
    "tool.name": "get_weather",
    "tool.input": "{\\"location\\": \\"Tokyo\\"}",
    "tool.output": "{\\"temp\\": \\"18°C\\"}",
    "tool.latency_ms": 890
  }
}
```

### GPU Metrics

When enabled, captures real-time GPU data:

```json
{
  "metrics": [
    {
      "name": "gen_ai.gpu.utilization",
      "value": 67.5,
      "unit": "%",
      "timestamp": "2025-11-18T14:23:00Z"
    },
    {
      "name": "gen_ai.gpu.memory.used",
      "value": 512.34,
      "unit": "MiB"
    }
  ]
}
```

---

## 🌱 CO2 Emissions Tracking

TraceVerde integrates with CodeCarbon for sustainability monitoring:

```python
from genai_otel_instrument import instrument_llm

# Enable CO2 tracking
instrument_llm(
    enable_content_capture=True,
    enable_carbon_tracking=True
)

# Your LLM calls now track carbon emissions!
```

**Captured Metrics:**
- 🌍 CO2 emissions (grams)
- ⚡ Energy consumed (kWh)
- 📍 Geographic region
- 💻 Hardware type (CPU/GPU)

---

## 🔧 Advanced Configuration

### Custom Exporters

```python
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Export to Jaeger/Tempo/etc
otlp_exporter = OTLPSpanExporter(endpoint="http://localhost:4317")
span_processor = BatchSpanProcessor(otlp_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

instrument_llm(enable_content_capture=True)
```

### Content Capture Control

```python
# Capture full prompts and responses (default: True)
instrument_llm(enable_content_capture=True)

# Disable for privacy/compliance
instrument_llm(enable_content_capture=False)
```

### GPU Metrics

```python
# Enable GPU monitoring (requires pynvml)
instrument_llm(
    enable_content_capture=True,
    enable_gpu_metrics=True,
    gpu_poll_interval=1.0  # seconds
)
```

---

## 📈 Integration with SMOLTRACE

TraceVerde powers SMOLTRACE's evaluation capabilities:

```python
# SMOLTRACE automatically uses TraceVerde for instrumentation
from smoltrace import evaluate_agent

results = evaluate_agent(
    model="gpt-4",
    agent_type="tool",
    enable_otel=True  # Uses TraceVerde under the hood!
)
```

---

## 🎯 Use Cases

### 1. Development & Debugging
```python
# See exactly what your agent is doing
instrument_llm(enable_content_capture=True)

# Run your agent
agent.run("Complex task")

# View traces in console or Jaeger
```

### 2. Production Monitoring
```python
# Export to your observability platform
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

otlp_exporter = OTLPSpanExporter(endpoint="https://your-otel-collector")
# ... setup processor ...

instrument_llm(enable_content_capture=False)  # Privacy mode
```

### 3. Cost Analysis
```python
# Track costs across all LLM calls
instrument_llm(enable_content_capture=True)

# Analyze cost per user/session/feature
# All costs automatically captured in span attributes
```

### 4. Sustainability Reporting
```python
# Monitor environmental impact
instrument_llm(
    enable_carbon_tracking=True,
    enable_gpu_metrics=True
)

# Generate CO2 reports from trace data
```

---

## 📏 OpenTelemetry Standards

TraceVerde follows the **Gen AI Semantic Conventions**:
- ✅ Consistent attribute naming (`gen_ai.*`)
- ✅ Standard span structure
- ✅ Compatible with all OTEL collectors
- ✅ Works with Jaeger, Tempo, Datadog, New Relic, etc.
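
To show what that compliance buys in practice, here is a small sketch that aggregates the standard `gen_ai.*` attributes generically. It uses the OTel SDK's in-memory exporter (normally a testing utility) and assumes instrumented calls have already run:

```python
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

# Collect finished spans in memory instead of shipping them anywhere
exporter = InMemorySpanExporter()
trace.set_tracer_provider(TracerProvider())
trace.get_tracer_provider().add_span_processor(SimpleSpanProcessor(exporter))

# ... run instrumented LLM calls here ...

# Any OTEL consumer can aggregate over the standard attribute names
total_tokens = sum(
    span.attributes.get("gen_ai.usage.total_tokens", 0)
    for span in exporter.get_finished_spans()
)
print(f"Total tokens across traced calls: {total_tokens}")
```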

---

## 🔗 Resources

- **GitHub**: [github.com/Mandark-droid/genai_otel_instrument](https://github.com/Mandark-droid/genai_otel_instrument)
- **PyPI**: [pypi.org/project/genai-otel-instrument](https://pypi.org/project/genai-otel-instrument)
- **Examples**: [github.com/Mandark-droid/genai_otel_instrument/examples](https://github.com/Mandark-droid/genai_otel_instrument/tree/main/examples)
- **OpenTelemetry Docs**: [opentelemetry.io](https://opentelemetry.io)

---

## 🐛 Troubleshooting

### Common Issues

**Q: Traces not appearing?**
```python
# Make sure you set up a tracer provider first
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider

trace.set_tracer_provider(TracerProvider())
```

**Q: GPU metrics not working?**
```bash
# Install GPU support
pip install genai-otel-instrument[gpu]

# Verify NVIDIA drivers installed
nvidia-smi
```

**Q: Content capture not working?**
```python
# Explicitly enable content capture
instrument_llm(enable_content_capture=True)
```

---

## 📄 License

**AGPL-3.0** - Open source and free to use

---

## 🤝 Contributing

Contributions welcome!
- Report bugs on GitHub Issues
- Submit PRs for new framework support
- Share your use cases

---

*TraceVerde - Making AI agents observable, one trace at a time* 🔭
""")


def create_smoltrace_tab():
    """Create the SMOLTRACE documentation tab"""
    return gr.Markdown("""
# 📊 SMOLTRACE

**Lightweight Agent Evaluation Engine with Built-in OpenTelemetry Tracing**

[![GitHub](https://img.shields.io/badge/GitHub-SMOLTRACE-black?logo=github)](https://github.com/Mandark-droid/SMOLTRACE)
[![PyPI](https://img.shields.io/badge/PyPI-smoltrace-blue?logo=pypi)](https://pypi.org/project/smoltrace/)

---

## What is SMOLTRACE?

SMOLTRACE is a **production-ready** evaluation framework for AI agents that:

- ✅ Evaluates agents across tool usage, code execution, and both
- ✅ Supports both API models (via LiteLLM) and local models (via Transformers)
- ✅ Automatically captures OpenTelemetry traces using TraceVerde
- ✅ Generates structured datasets for HuggingFace
- ✅ Tracks costs, GPU metrics, and CO2 emissions

**Goal**: Become HuggingFace's standard agent evaluation platform

---

## 📦 Installation

```bash
# Basic installation
pip install smoltrace

# With OpenTelemetry support
pip install smoltrace[otel]

# With GPU metrics
pip install smoltrace[otel,gpu]

# Everything
pip install smoltrace[all]
```

---

## 🚀 Quick Start

### Command Line

```bash
# Evaluate GPT-4 as a tool agent
smoltrace-eval \\
    --model openai/gpt-4 \\
    --provider litellm \\
    --agent-type tool \\
    --enable-otel

# Evaluate local Llama model
smoltrace-eval \\
    --model meta-llama/Llama-3.1-8B \\
    --provider transformers \\
    --agent-type both \\
    --enable-otel \\
    --enable-gpu-metrics
```

### Python API

```python
from smoltrace import evaluate_agent

# Run evaluation
results = evaluate_agent(
    model="openai/gpt-4",
    provider="litellm",
    agent_type="tool",
    enable_otel=True,
    num_tests=100
)

# Access results
print(f"Success Rate: {results.success_rate}%")
print(f"Total Cost: ${results.total_cost}")
print(f"Avg Duration: {results.avg_duration_ms}ms")

# Upload to HuggingFace
results.upload_to_hf(
    results_repo="username/agent-results-gpt4",
    traces_repo="username/agent-traces-gpt4",
    leaderboard_repo="username/agent-leaderboard"
)
```

---

## 🎯 Evaluation Types

### 1. Tool Agent
Tests ability to use external tools:
```bash
smoltrace-eval --model gpt-4 --agent-type tool
```

**Example Task**: "What's the weather in Tokyo?"
- Agent must call `get_weather` tool
- Verify correct tool selection
- Check response quality

### 2. Code Agent
Tests code generation and execution:
```bash
smoltrace-eval --model gpt-4 --agent-type code
```

**Example Task**: "Calculate the sum of first 10 prime numbers"
- Agent must generate Python code
- Execute code safely
- Return correct result

### 3. Both (Combined)
Tests comprehensive agent capabilities:
```bash
smoltrace-eval --model gpt-4 --agent-type both
```

**Tests both tool usage AND code generation**

---

## 📊 What Gets Generated?

SMOLTRACE creates **4 structured datasets** on HuggingFace:

### 1. Leaderboard Dataset
Aggregate statistics for all evaluation runs:

```python
{
    "run_id": "uuid",
    "model": "openai/gpt-4",
    "agent_type": "tool",
    "provider": "litellm",

    # Performance
    "success_rate": 95.8,
    "total_tests": 100,
    "avg_duration_ms": 3200.0,

    # Cost & Resources
    "total_tokens": 15000,
    "total_cost_usd": 0.05,
    "co2_emissions_g": 0.22,
    "gpu_utilization_avg": 67.5,

    # Dataset References
    "results_dataset": "username/agent-results-gpt4",
    "traces_dataset": "username/agent-traces-gpt4",
    "metrics_dataset": "username/agent-metrics-gpt4",

    # Metadata
    "timestamp": "2025-11-18T14:23:00Z",
    "submitted_by": "username"
}
```

### 2. Results Dataset
Individual test case results:

```python
{
    "run_id": "uuid",
    "task_id": "task_001",
    "test_index": 0,

    # Test Case
    "prompt": "What's the weather in Tokyo?",
    "expected_tool": "get_weather",

    # Result
    "success": true,
    "response": "The weather in Tokyo is 18°C and clear.",
    "tool_called": "get_weather",

    # Metrics
    "execution_time_ms": 2450.0,
    "total_tokens": 234,
    "cost_usd": 0.0012,

    # Trace Reference
    "trace_id": "trace_abc123"
}
```

### 3. Traces Dataset
Full OpenTelemetry traces:

```python
{
    "trace_id": "trace_abc123",
    "run_id": "uuid",
    "spans": [
        {
            "spanId": "span_001",
            "name": "Agent Execution",
            "startTime": "2025-11-18T14:23:01.000Z",
            "endTime": "2025-11-18T14:23:03.450Z",
            "attributes": {
                "agent.type": "tool",
                "gen_ai.system": "openai",
                "gen_ai.request.model": "gpt-4"
            }
        },
        # ... more spans ...
    ]
}
```

### 4. Metrics Dataset
GPU metrics and performance data:

```python
{
    "run_id": "uuid",
    "trace_id": "trace_abc123",
    "metrics": [
        {
            "name": "gen_ai.gpu.utilization",
            "value": 67.5,
            "unit": "%",
            "timestamp": "2025-11-18T14:23:01.000Z"
        },
        {
            "name": "gen_ai.co2.emissions",
            "value": 0.22,
            "unit": "gCO2e"
        }
    ]
}
```
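
As a small post-processing sketch (standard library only; the record shape follows the Traces Dataset example above, with ISO-8601 timestamps assumed):

```python
from datetime import datetime

def span_duration_ms(span: dict) -> float:
    """Duration of one span, computed from its ISO-8601 timestamps."""
    start = datetime.fromisoformat(span["startTime"].replace("Z", "+00:00"))
    end = datetime.fromisoformat(span["endTime"].replace("Z", "+00:00"))
    return (end - start).total_seconds() * 1000

# One span shaped like the Traces Dataset example above
span = {
    "spanId": "span_001",
    "startTime": "2025-11-18T14:23:01.000Z",
    "endTime": "2025-11-18T14:23:03.450Z",
}
print(f"{span['spanId']}: {span_duration_ms(span):.0f} ms")  # -> span_001: 2450 ms
```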

---

## 🔧 Configuration Options

### Model Selection

```bash
# API Models (via LiteLLM)
--model openai/gpt-4
--model anthropic/claude-3-5-sonnet
--model google/gemini-pro

# Local Models (via Transformers)
--model meta-llama/Llama-3.1-8B
--model mistralai/Mistral-7B-v0.1
```

### Provider Selection

```bash
--provider litellm        # For API models
--provider transformers   # For local models
```

### Hardware Selection

```bash
# Automatic (default)
# API models → CPU
# Local models → GPU if available

# Manual override
--hardware cpu
--hardware gpu_a10
--hardware gpu_h200
```

### OpenTelemetry Options

```bash
--enable-otel              # Enable tracing
--enable-gpu-metrics       # Capture GPU data
--enable-carbon-tracking   # Track CO2 emissions
```

---

## 🏗️ Integration with HuggingFace Jobs

SMOLTRACE works seamlessly with HuggingFace Jobs:

```yaml
# job.yaml
name: SMOLTRACE Evaluation
hardware: gpu-h200
environment:
  MODEL: meta-llama/Llama-3.1-8B
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
command: |
  pip install smoltrace[otel,gpu]
  smoltrace-eval \\
    --model $MODEL \\
    --provider transformers \\
    --agent-type both \\
    --enable-otel \\
    --enable-gpu-metrics \\
    --results-repo ${{ username }}/agent-results \\
    --leaderboard-repo huggingface/smolagents-leaderboard
```

**Benefits:**
- 💰 **H200 GPUs**: 2x faster evaluation
- 📊 **Automatic Upload**: Results → HuggingFace datasets
- 🔄 **Reproducible**: Same environment every time

---

## 📈 Integration with TraceMind-AI

SMOLTRACE datasets power the TraceMind-AI interface:

```
SMOLTRACE Evaluation
         ↓
 4 Datasets Created
         ↓
┌────────┴────────┐
│                 │
│  TraceMind-AI   │  ← You are here!
│  (Gradio UI)    │
│                 │
└─────────────────┘
```

**What TraceMind-AI Shows:**
- 📊 **Leaderboard**: All evaluation runs
- 🔍 **Run Detail**: Individual test cases
- 🕵️ **Trace Detail**: OpenTelemetry visualization
- 🤖 **Agent Chat**: MCP-powered analysis

---

## 🎯 Best Practices

### 1. Start Small
```bash
# Test with 10 runs first
smoltrace-eval --model gpt-4 --num-tests 10

# Scale up after validation
smoltrace-eval --model gpt-4 --num-tests 1000
```

### 2. Use Appropriate Hardware
```bash
# API models → CPU (no GPU needed)
smoltrace-eval --model openai/gpt-4 --hardware cpu

# Local models → GPU (faster)
smoltrace-eval --model meta-llama/Llama-3.1-8B --hardware gpu_h200
```

### 3. Enable Full Observability
```bash
# Capture everything
smoltrace-eval \\
    --model your-model \\
    --enable-otel \\
    --enable-gpu-metrics \\
    --enable-carbon-tracking
```

### 4. Organize Your Datasets
```bash
# Use descriptive repo names
--results-repo username/results-gpt4-tool-20251118
--traces-repo username/traces-gpt4-tool-20251118
--leaderboard-repo username/agent-leaderboard
```

---

## 🔍 Cost Estimation

Before running evaluations, estimate costs:

```python
from smoltrace import estimate_cost

# API model
api_cost = estimate_cost(
    model="openai/gpt-4",
    num_tests=1000,
    agent_type="tool"
)
print(f"Estimated cost: ${api_cost.total_cost}")

# GPU job
gpu_cost = estimate_cost(
    model="meta-llama/Llama-3.1-8B",
    num_tests=1000,
    hardware="gpu_h200"
)
print(f"Estimated cost: ${gpu_cost.total_cost}")
print(f"Estimated time: {gpu_cost.duration_minutes} minutes")
```

---

## 📐 Architecture

```
┌────────────────────────────────────────────┐
│              SMOLTRACE Core                │
├────────────────────────────────────────────┤
│                                            │
│  ┌──────────────┐    ┌──────────────┐      │
│  │   LiteLLM    │    │ Transformers │      │
│  │   Provider   │    │   Provider   │      │
│  └──────┬───────┘    └──────┬───────┘      │
│         │                   │              │
│         └─────────┬─────────┘              │
│                   ↓                        │
│           ┌──────────────┐                 │
│           │  TraceVerde  │                 │
│           │    (OTEL)    │                 │
│           └──────┬───────┘                 │
│                  ↓                         │
│           ┌──────────────┐                 │
│           │   Dataset    │                 │
│           │  Generator   │                 │
│           └──────┬───────┘                 │
│                  ↓                         │
│      ┌───────────────────────┐             │
│      │  HuggingFace Upload   │             │
│      └───────────────────────┘             │
│                                            │
└────────────────────────────────────────────┘
```

---

## 🔗 Resources

- **GitHub**: [github.com/Mandark-droid/SMOLTRACE](https://github.com/Mandark-droid/SMOLTRACE)
- **PyPI**: [pypi.org/project/smoltrace](https://pypi.org/project/smoltrace/)
- **Examples**: [github.com/Mandark-droid/SMOLTRACE/examples](https://github.com/Mandark-droid/SMOLTRACE/tree/main/examples)
- **Dataset Schema**: [github.com/Mandark-droid/SMOLTRACE/docs/schema.md](https://github.com/Mandark-droid/SMOLTRACE/blob/main/docs/schema.md)

---

## 🐛 Troubleshooting

### Common Issues

**Q: Evaluation is slow?**
```bash
# Use GPU for local models
--hardware gpu_h200

# Or reduce test count
--num-tests 10
```

**Q: Traces not captured?**
```bash
# Make sure OTEL is enabled
--enable-otel
```

**Q: Upload to HF failing?**
```bash
# Check HF token
export HF_TOKEN=your_token_here

# Verify repo exists or allow auto-create
```

---

## 📄 License

**AGPL-3.0** - Open source and free to use

---

## 🤝 Contributing

We welcome contributions!
- Add new agent types
- Support more frameworks
- Improve evaluation metrics
- Optimize performance

---

*SMOLTRACE - Lightweight evaluation for heavyweight results* 📊
""")


def create_mcp_server_tab():
    """Create the TraceMind-MCP-Server documentation tab"""
    return gr.Markdown("""
# 🔌 TraceMind-MCP-Server

**Building MCP: Intelligent Analysis Tools for Agent Evaluation**

[![HF Space](https://img.shields.io/badge/HuggingFace-TraceMind--MCP--Server-yellow?logo=huggingface)](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server)
[![Track 1](https://img.shields.io/badge/Track-Building%20MCP%20(Enterprise)-blue)](https://github.com/modelcontextprotocol/hackathon)

---

## What is TraceMind-MCP-Server?

TraceMind-MCP-Server is a **Track 1 (Building MCP)** submission that provides MCP tools for intelligent agent evaluation analysis.

**Key Features:**
- 🤖 Powered by Google Gemini 2.5 Pro
- 🔌 Standards-compliant MCP implementation
- 📊 Analyzes HuggingFace evaluation datasets
- 💡 Provides actionable insights and recommendations
- 🌐 Accessible via SSE transport for Gradio integration

---

## 🛠️ MCP Tools Provided

### 1. `analyze_leaderboard`

**Purpose**: Generate AI-powered insights about evaluation leaderboard data

**Input Schema:**
```json
{
    "leaderboard_repo": "string",   // HF dataset (default: kshitijthakkar/smoltrace-leaderboard)
    "metric_focus": "string",       // "overall" | "accuracy" | "cost" | "latency" | "co2"
    "time_range": "string",         // "last_week" | "last_month" | "all_time"
    "top_n": "integer"              // Number of top models to highlight
}
```

**What It Does:**
1. Fetches leaderboard dataset from HuggingFace
2. Filters by time range
3. Analyzes trends based on metric focus
4. Uses Gemini to generate insights
5. Returns markdown-formatted analysis

**Example Output:**
```markdown
Based on 247 evaluations in the past week:

**Top Performers:**
- GPT-4 leads in accuracy at 95.8% but costs $0.05 per run
- Llama-3.1-8B offers best cost/performance at 93.4% accuracy for $0.002
- Qwen3-MoE is fastest at 1.7s average duration

**Trends:**
- API models dominate accuracy rankings
- GPU models are 10x more cost-effective
- H200 jobs show 2x faster execution vs A10

**Recommendations:**
- For production: Consider Llama-3.1-8B for cost-sensitive workloads
- For maximum accuracy: GPT-4 remains state-of-the-art
- For eco-friendly: Claude-3-Haiku has lowest CO2 emissions
```

---

### 2. `estimate_cost`

**Purpose**: Estimate evaluation costs with hardware recommendations

**Input Schema:**
```json
{
    "model": "string",        // Model name (e.g., "openai/gpt-4")
    "agent_type": "string",   // "tool" | "code" | "both"
    "num_tests": "integer",   // Number of test cases (default: 100)
    "hardware": "string"      // "cpu" | "gpu_a10" | "gpu_h200" (optional)
}
```

**What It Does:**
1. Determines if model is API or local
2. Calculates token usage estimates
3. Computes costs (API pricing or GPU time)
4. Estimates duration and CO2 emissions
5. Provides hardware recommendations

**Example Output:**
```markdown
## Cost Estimation: openai/gpt-4 (Tool Agent, 100 tests)

**Hardware**: CPU (API model)

**Cost Breakdown:**
- Total Tokens: ~15,000
- Prompt Tokens: ~5,000 ($0.03)
- Completion Tokens: ~10,000 ($0.06)
- **Total Cost: $0.09**

**Time Estimate:**
- Average per test: 3.2s
- Total duration: ~5.3 minutes

**CO2 Emissions:**
- Estimated: 0.45g CO2e

**Recommendations:**
- ✅ Good choice for accuracy-critical applications
- ⚠️ Consider Llama-3.1-8B for cost savings (10x cheaper)
- 💡 Use caching to reduce repeated API calls
```

---

### 3. `debug_trace`

**Purpose**: Answer questions about agent execution traces

**Input Schema:**
```json
{
    "trace_dataset": "string",     // HF dataset with OTEL traces
    "trace_id": "string",          // Specific trace to analyze
    "question": "string",          // Question about the trace
    "include_metrics": "boolean"   // Include GPU metrics (default: true)
}
```

**What It Does:**
1. Fetches trace data from HuggingFace
2. Parses OpenTelemetry spans
3. Analyzes execution flow
4. Uses Gemini to answer questions
5. Provides span-level details

**Example Output:**
```markdown
## Why was the tool called twice?

Based on trace analysis for `trace_abc123`:

**First Tool Call (span_003)**:
- Time: 14:23:19.000
- Tool: `search_web`
- Input: {"query": "latest AI news"}
- Result: 5 results returned
- Issue: Results were 2 days old

**Second Tool Call (span_005)**:
- Time: 14:23:21.200
- Tool: `search_web`
- Input: {"query": "latest AI news today"}
- Reasoning: LLM determined first results were outdated
- Duration: 1200ms

**Why Twice?**
The agent's reasoning chain shows it initially received outdated results.
The LLM then decided to refine the query with "today" keyword to get
more recent data.

**Performance Impact:**
- Added 2.09s to total execution
- Cost increase: +$0.0003
- This is normal for agents with iterative reasoning

**Recommendation:**
Consider adding date filters to initial tool calls to avoid retries.
```

---

### 4. `compare_runs`

**Purpose**: Side-by-side comparison of evaluation runs

**Input Schema:**
```json
{
    "leaderboard_repo": "string",   // HF leaderboard dataset
    "run_id_1": "string",           // First run ID
    "run_id_2": "string",           // Second run ID
    "comparison_focus": "string"    // "overall" | "cost" | "accuracy" | "speed"
}
```

**What It Does:**
1. Fetches data for both runs
2. Compares key metrics
3. Identifies strengths/weaknesses
4. Provides recommendations

**Example Output:**
```markdown
## Comparison: GPT-4 vs Llama-3.1-8B

| Metric | GPT-4 | Llama-3.1-8B | Winner |
|--------|-------|--------------|--------|
| Success Rate | 95.8% | 93.4% | GPT-4 (+2.4%) |
| Avg Duration | 3.2s | 2.1s | Llama (+34% faster) |
| Cost per Run | $0.05 | $0.002 | Llama (25x cheaper) |
| CO2 Emissions | 0.22g | 0.08g | Llama (64% less) |

**Analysis:**
- GPT-4 has slight accuracy edge but at significant cost premium
- Llama-3.1-8B offers excellent cost/performance ratio
- For 1000 runs: GPT-4 costs $50, Llama costs $2

**Recommendation:**
Use Llama-3.1-8B for production unless 95%+ accuracy is critical.
Consider hybrid approach: Llama for routine tasks, GPT-4 for complex ones.
```

---

### 5. `analyze_results`

**Purpose**: Deep dive into test case results

**Input Schema:**
```json
{
    "results_repo": "string",   // HF results dataset
    "run_id": "string",         // Run to analyze
    "focus": "string"           // "failures" | "successes" | "all"
}
```

**What It Does:**
1. Loads results dataset
2. Filters by success/failure
3. Identifies patterns
4. Suggests optimizations
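
This tool has no example output above; as a hypothetical invocation (assuming the sync wrapper used by the Chat screen exposes `analyze_results` the same way it exposes `analyze_leaderboard` below), a call might look like:

```python
from mcp_client.sync_wrapper import get_sync_mcp_client

mcp = get_sync_mcp_client()
# Hypothetical: method name and arguments mirror the tool's input schema
report = mcp.analyze_results(
    results_repo="username/agent-results-gpt4",
    run_id="uuid",
    focus="failures",
)
print(report)
```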
1270
+
1271
+ ---
+
+ ## 🌐 Accessing the MCP Server
+
+ ### Via TraceMind-AI (This App!)
+
+ The **Agent Chat** screen uses TraceMind-MCP-Server automatically:
+
+ ```python
+ # Happens automatically in the Chat screen
+ from mcp_client.sync_wrapper import get_sync_mcp_client
+
+ mcp = get_sync_mcp_client()
+ insights = mcp.analyze_leaderboard(
+     metric_focus="overall",
+     time_range="last_week"
+ )
+ ```
+
+ ### Via SSE Endpoint (for smolagents)
+
+ ```python
+ from smolagents import MCPClient, ToolCallingAgent, InferenceClientModel
+
+ # Connect to MCP server via SSE; the context manager yields the
+ # server's tools and closes the connection on exit
+ with MCPClient(
+     {
+         "url": "https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse",
+         "transport": "sse",
+     }
+ ) as tools:
+     # Create agent with MCP tools
+     agent = ToolCallingAgent(
+         tools=tools,
+         model=InferenceClientModel(),
+     )
+
+     # Tools automatically available!
+     agent.run("Analyze the leaderboard and show top 3 models")
+ ```
+
+ ### Via MCP SDK (for other clients)
+
+ ```python
+ import asyncio
+
+ from mcp import ClientSession, StdioServerParameters
+ from mcp.client.stdio import stdio_client
+
+ # For local development: spawn the server as a subprocess over stdio
+ server_params = StdioServerParameters(
+     command="python",
+     args=["-m", "mcp_tools"]
+ )
+
+ async def main():
+     async with stdio_client(server_params) as (read, write):
+         async with ClientSession(read, write) as session:
+             await session.initialize()
+
+             # Call tools
+             result = await session.call_tool(
+                 "analyze_leaderboard",
+                 arguments={"metric_focus": "cost"}
+             )
+             print(result)
+
+ asyncio.run(main())
+ ```
+
+ ---
+
+ ## 🎯 Use Cases
+
+ ### 1. Interactive Analysis (Agent Chat)
+ Ask natural language questions:
+ - "What are the top 3 models by accuracy?"
+ - "Compare GPT-4 and Claude-3 on cost"
+ - "Why is this agent slow?"
+
+ ### 2. Automated Insights (Leaderboard)
+ Get AI summaries automatically:
+ - Weekly trend reports
+ - Cost optimization recommendations
+ - Performance alerts
+
+ ### 3. Debugging (Trace Detail)
+ Understand agent behavior:
+ - "Why did the agent fail?"
+ - "Which tool took the longest?"
+ - "Why was the same tool called twice?"
+
+ ### 4. Planning (Cost Estimator)
+ Before running evaluations:
+ - "How much will 1000 tests cost?"
+ - "Should I use A10 or H200?"
+ - "What's the CO2 impact?"
+
+ ---
+
+ ## 🏗️ Architecture
+
+ ```
+ ┌──────────────────────────────────────────────────────┐
+ │           TraceMind-MCP-Server (HF Space)            │
+ ├──────────────────────────────────────────────────────┤
+ │                                                      │
+ │  ┌─────────────────┐        ┌──────────────────┐     │
+ │  │   Gradio App    │        │   MCP Protocol   │     │
+ │  │   (UI + SSE)    │◄──────►│     Handler      │     │
+ │  └─────────────────┘        └────────┬─────────┘     │
+ │                                      │               │
+ │                             ┌────────▼─────────┐     │
+ │                             │   Tool Router    │     │
+ │                             └────────┬─────────┘     │
+ │                                      │               │
+ │        ┌─────────────────────────────┼───────┐       │
+ │        │                             │       │       │
+ │ ┌──────▼──────┐         ┌────────────▼─┐  ┌──▼─────┐ │
+ │ │ Leaderboard │         │Cost Estimator│  │ Trace  │ │
+ │ │  Analyzer   │         │              │  │Debugger│ │
+ │ └─────────────┘         └──────────────┘  └────────┘ │
+ │        │                             │       │       │
+ │        └─────────────────────────────┴───────┘       │
+ │                                      │               │
+ │                            ┌─────────▼──────────┐    │
+ │                            │   Gemini 2.5 Pro   │    │
+ │                            │ (Analysis Engine)  │    │
+ │                            └────────────────────┘    │
+ │                                                      │
+ └──────────────────────────────────────────────────────┘
+                                        │
+                                        │  MCP Protocol (SSE)
+                                        │
+                                        ▼
+                           ┌──────────────────────────┐
+                           │    TraceMind-AI (UI)     │
+                           │    Agent Chat Screen     │
+                           └──────────────────────────┘
+ ```
+
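+ The SSE endpoint in the diagram comes from Gradio's built-in MCP support: functions exposed by the app can be published as MCP tools. A minimal sketch (not the server's actual source) looks like this:
+
+ ```python
+ # Minimal sketch of exposing a Gradio function as an MCP tool over SSE;
+ # the function body and signature here are illustrative only
+ import gradio as gr
+
+ def analyze_leaderboard(metric_focus: str = "overall") -> str:
+     """Analyze the leaderboard. The docstring becomes the tool description."""
+     return f"Analysis focused on {metric_focus}..."
+
+ demo = gr.Interface(fn=analyze_leaderboard, inputs="text", outputs="text")
+
+ # mcp_server=True serves the app's functions at /gradio_api/mcp/sse
+ demo.launch(mcp_server=True)
+ ```
+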
+ ---
+
+ ## 🔧 Configuration
+
+ ### Environment Variables
+
+ ```env
+ # Google Gemini API (required)
+ GEMINI_API_KEY=your_api_key_here
+
+ # HuggingFace Token (for dataset access)
+ HF_TOKEN=your_token_here
+
+ # Default Leaderboard (optional)
+ DEFAULT_LEADERBOARD_REPO=kshitijthakkar/smoltrace-leaderboard
+ ```
+
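+ A sketch of reading this configuration at startup (the variable names are the ones above; the fallback behavior is illustrative):
+
+ ```python
+ import os
+
+ GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]  # required, fail fast if missing
+ HF_TOKEN = os.getenv("HF_TOKEN")               # optional for public datasets
+ LEADERBOARD_REPO = os.getenv(
+     "DEFAULT_LEADERBOARD_REPO",
+     "kshitijthakkar/smoltrace-leaderboard",
+ )
+ ```
+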
+ ---
+
+ ## 📊 Dataset Requirements
+
+ MCP tools expect datasets with specific schemas:
+
+ ### Leaderboard Dataset
+ ```python
+ {
+     "run_id": "string",
+     "model": "string",
+     "success_rate": "float",
+     "total_cost_usd": "float",
+     "timestamp": "string",
+     # ... other metrics
+ }
+ ```
+
+ ### Results Dataset
+ ```python
+ {
+     "run_id": "string",
+     "task_id": "string",
+     "success": "boolean",
+     "trace_id": "string",
+     # ... other fields
+ }
+ ```
+
+ ### Traces Dataset
+ ```python
+ {
+     "trace_id": "string",
+     "spans": [
+         {
+             "spanId": "string",
+             "name": "string",
+             "attributes": {},
+             # ... OTEL format
+         }
+     ]
+ }
+ ```
+
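+ A sketch of loading the leaderboard dataset and checking for the required columns (assumes the `datasets` library and the default repo above; private repos also need `HF_TOKEN`):
+
+ ```python
+ from datasets import load_dataset
+
+ ds = load_dataset("kshitijthakkar/smoltrace-leaderboard", split="train")
+
+ # Validate against the schema documented above
+ required = {"run_id", "model", "success_rate", "total_cost_usd", "timestamp"}
+ missing = required - set(ds.column_names)
+ if missing:
+     raise ValueError(f"Leaderboard dataset is missing columns: {missing}")
+ ```
+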
+ ---
+
+ ## 🎓 Learning Resources
+
+ ### MCP Documentation
+ - [Model Context Protocol Spec](https://modelcontextprotocol.io)
+ - [MCP Python SDK](https://github.com/modelcontextprotocol/python-sdk)
+ - [Gradio MCP Integration](https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks#model-context-protocol-mcp)
+
+ ### Implementation Examples
+ - **This Server**: [HF Space Code](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server/tree/main)
+ - **Client Integration**: [TraceMind-AI mcp_client/](https://github.com/Mandark-droid/TraceMind-AI/tree/main/mcp_client)
+
+ ---
+
+ ## 🐛 Troubleshooting
+
+ ### Common Issues
+
+ **Q: MCP tools not appearing?**
+ ```bash
+ # Verify MCP_SERVER_URL is correct
+ echo $MCP_SERVER_URL
+
+ # Should be: https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse
+ ```
+
+ **Q: "Failed to load dataset" error?**
+ ```bash
+ # Check HF token
+ export HF_TOKEN=your_token_here
+
+ # Verify the dataset exists and is readable
+ python -c "from huggingface_hub import dataset_info; print(dataset_info('kshitijthakkar/smoltrace-leaderboard'))"
+ ```
+
+ **Q: Gemini API errors?**
+ ```bash
+ # Verify the API key (Gemini uses the x-goog-api-key header)
+ curl -H "x-goog-api-key: $GEMINI_API_KEY" \\
+   https://generativelanguage.googleapis.com/v1beta/models
+
+ # Check rate limits (10 requests/minute on free tier)
+ ```
+
+ ---
+
+ ## 🔗 Links
+
+ - **Live Server**: [HF Space](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server)
+ - **Source Code**: [GitHub](https://github.com/Mandark-droid/TraceMind-mcp-server)
+ - **Client (This App)**: [TraceMind-AI](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind)
+ - **MCP Spec**: [modelcontextprotocol.io](https://modelcontextprotocol.io)
+
+ ---
+
+ ## 📄 License
+
+ **AGPL-3.0** - Open source and free to use
+
+ ---
+
+ ## 🤝 Contributing
+
+ Help improve TraceMind-MCP-Server:
+ - Add new MCP tools
+ - Improve analysis quality
+ - Optimize performance
+ - Add support for more datasets
+
+ ---
+
+ ## 🏆 MCP's 1st Birthday Hackathon
+
+ **Track 1 Submission: Building MCP (Enterprise)**
+
+ TraceMind-MCP-Server demonstrates:
+ - ✅ Standards-compliant MCP implementation
+ - ✅ SSE transport for Gradio integration
+ - ✅ Real-world use case (agent evaluation)
+ - ✅ Gemini 2.5 Pro integration
+ - ✅ Production-ready deployment on HF Spaces
+
+ **Used by**: TraceMind-AI (Track 2) for autonomous agent chat
+
+ ---
+
+ *TraceMind-MCP-Server - Intelligent analysis, one tool at a time* 🔌
+     """)
+
+
+ def create_documentation_screen():
+     """
+     Create the complete documentation screen with tabs
+
+     Returns:
+         gr.Blocks: Gradio Blocks interface for documentation
+     """
+     with gr.Blocks() as documentation_interface:
+         gr.Markdown("""
+         # 📚 TraceMind Documentation
+
+         Comprehensive documentation for the entire TraceMind ecosystem
+         """)
+
+         with gr.Tabs():
+             with gr.Tab("📖 About"):
+                 create_about_tab()
+
+             with gr.Tab("🔭 TraceVerde"):
+                 create_traceverde_tab()
+
+             with gr.Tab("📊 SmolTrace"):
+                 create_smoltrace_tab()
+
+             with gr.Tab("🔌 TraceMind-MCP-Server"):
+                 create_mcp_server_tab()
+
+         gr.Markdown("""
+         ---
+
+         ### 💡 Quick Navigation
+
+         - **Getting Started**: Start with the "About" tab for ecosystem overview
+         - **Instrumentation**: See "TraceVerde" for adding observability to your agents
+         - **Evaluation**: Check "SmolTrace" for running evaluations
+         - **MCP Integration**: Explore "TraceMind-MCP-Server" for intelligent analysis
+
+         ### 🔗 External Resources
+
+         - [GitHub Organization](https://github.com/Mandark-droid)
+         - [HuggingFace Spaces](https://huggingface.co/MCP-1st-Birthday)
+         - [MCP Specification](https://modelcontextprotocol.io)
+
+         *Built with ❤️ for MCP's 1st Birthday Hackathon*
+         """)
+
+     return documentation_interface
+
+
+ if __name__ == "__main__":
+     # For standalone testing
+     demo = create_documentation_screen()
+     demo.launch()