Spaces:

MCP-1st-Birthday
/

TraceMind

Running

File size: 79,129 Bytes

"""
TraceMind-AI - Agent Evaluation Platform
Enterprise-grade AI agent evaluation with MCP integration
"""

import os
import pandas as pd
import gradio as gr
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Import data loader and components
from data_loader import create_data_loader_from_env
from components.leaderboard_table import generate_leaderboard_html
from components.analytics_charts import (
    create_trends_plot,
    create_performance_heatmap,
    create_speed_accuracy_scatter,
    create_cost_efficiency_scatter
)
from components.report_cards import generate_leaderboard_summary_card, generate_run_report_card, download_card_as_png_js
from screens.trace_detail import (
    create_span_visualization,
    create_span_table,
    create_gpu_metrics_dashboard,
    create_gpu_summary_cards
)
from screens.dashboard import (
    create_dashboard_ui,
    update_dashboard_data
)
from screens.compare import (
    create_compare_ui,
    on_compare_runs
)
from utils.navigation import Navigator, Screen



# Trace Detail handlers and helpers

def _first_present(mapping, *keys):
    """Return the first value stored under ``keys`` that is not None, else None."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


def _coerce_number(value, cast, default=None):
    """Apply ``cast`` (int or float) to ``value``; return ``default`` if it cannot be converted."""
    if value is None:
        return default
    try:
        return cast(value)
    except (TypeError, ValueError):
        return default


def create_span_details_table(spans):
    """
    Create table view of span details

    Timestamps are read from ``startTime``/``endTime`` (falling back to the
    ``*UnixNano`` keys) and are accepted as numbers or numeric strings; the
    duration is the difference divided by 1,000,000. Token and cost values
    of 0 are reported as such rather than being treated as missing. Entries
    that are not dictionaries are skipped instead of aborting the whole table.

    Args:
        spans: Iterable of span dictionaries (may be None or empty)

    Returns:
        DataFrame with columns "Span Name", "Kind", "Duration (ms)",
        "Tokens", "Cost (USD)" and "Status"; empty (columns only) when there
        are no spans or an unexpected error occurs.
    """
    columns = ["Span Name", "Kind", "Duration (ms)", "Tokens", "Cost (USD)", "Status"]

    try:
        spans = list(spans) if spans is not None else []
        if not spans:
            return pd.DataFrame(columns=columns)

        rows = []
        for span in spans:
            if not isinstance(span, dict):
                # A malformed entry should not blank out every other row
                continue

            name = span.get('name', 'Unknown')
            kind = span.get('kind', 'INTERNAL')

            # The OpenInference attribute, when present, overrides the span's own kind
            attributes = span.get('attributes', {})
            has_attributes = isinstance(attributes, dict)
            if has_attributes and 'openinference.span.kind' in attributes:
                kind = attributes.get('openinference.span.kind', kind)

            # Coerce timestamps so numeric strings do not break the subtraction
            start = _coerce_number(span.get('startTime') or span.get('startTimeUnixNano'), int, 0)
            end = _coerce_number(span.get('endTime') or span.get('endTimeUnixNano'), int, 0)
            duration = (end - start) / 1000000 if start and end else 0  # Convert to ms

            status_info = span.get('status')
            status = status_info.get('code', 'OK') if isinstance(status_info, dict) else 'OK'

            tokens_str = "-"
            cost_str = "-"

            if has_attributes:
                # "is not None" lookups keep legitimate zero values
                prompt_tokens = _coerce_number(
                    _first_present(attributes, 'gen_ai.usage.prompt_tokens', 'llm.token_count.prompt'), int)
                completion_tokens = _coerce_number(
                    _first_present(attributes, 'gen_ai.usage.completion_tokens', 'llm.token_count.completion'), int)
                total_tokens = _coerce_number(attributes.get('llm.usage.total_tokens'), int)

                if prompt_tokens is not None and completion_tokens is not None:
                    total = prompt_tokens + completion_tokens
                    tokens_str = f"{total} ({prompt_tokens}+{completion_tokens})"
                elif total_tokens is not None:
                    tokens_str = str(total_tokens)

                cost = _coerce_number(
                    _first_present(attributes, 'gen_ai.usage.cost.total', 'llm.usage.cost'), float)
                if cost is not None:
                    cost_str = f"${cost:.6f}"

            rows.append({
                "Span Name": name,
                "Kind": kind,
                "Duration (ms)": round(duration, 2),
                "Tokens": tokens_str,
                "Cost (USD)": cost_str,
                "Status": status
            })

        return pd.DataFrame(rows, columns=columns)

    except Exception as e:
        # UI boundary: log and fall back to an empty table rather than crash the screen
        print(f"[ERROR] create_span_details_table: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(columns=columns)


def create_trace_metadata_html(trace_data: dict) -> str:
    """
    Create HTML for trace metadata display

    Args:
        trace_data: Trace dictionary; reads 'trace_id' (default 'Unknown')
            and 'spans' (list, array-like with ``tolist``, any iterable, or None)

    Returns:
        HTML snippet showing the trace ID and the number of spans. The trace
        ID is HTML-escaped because it comes from loaded dataset content.
    """
    from html import escape

    trace_id = escape(str(trace_data.get('trace_id', 'Unknown')))

    # Normalise spans to a list so len() works for arrays and other iterables
    spans = trace_data.get('spans', [])
    if hasattr(spans, 'tolist'):
        spans = spans.tolist()
    elif not isinstance(spans, list):
        spans = list(spans) if spans is not None else []

    metadata_html = f"""
    <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
        <h3 style="margin: 0 0 10px 0;">Trace Information</h3>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px;">
            <div>
                <strong>Trace ID:</strong> {trace_id}<br>
                <strong>Total Spans:</strong> {len(spans)}
            </div>
        </div>
    </div>
    """
    return metadata_html


def on_test_case_select(evt: gr.SelectData, df):
    """
    Handle test case selection in run detail - navigate to trace detail

    Looks up the selected row's trace in the current run's traces dataset,
    builds the span visualisations and (if the run has a metrics dataset)
    the GPU metrics views, then switches from the run detail screen to the
    trace detail screen.

    Args:
        evt: Gradio selection event; ``evt.index[0]`` is used as the row position
        df: DataFrame currently shown in the test cases table

    Returns:
        Dict mapping UI components to ``gr.update(...)`` values, or an empty
        dict (no updates) when nothing can be shown; a ``gr.Warning`` is
        raised to the user in that case.
    """
    global current_selected_run, current_selected_trace

    def span_timestamp(span, field_name):
        """Return the span's timestamp for field_name (numeric strings converted), or 0."""
        for key in (field_name, field_name.lower(), field_name.replace('Time', 'TimeUnixNano')):
            if key in span:
                value = span[key]
                if isinstance(value, str):
                    try:
                        return int(value)
                    except ValueError:
                        # Unparseable timestamp: treat as missing instead of aborting the view
                        return 0
                return value
        return 0

    print(f"[DEBUG] on_test_case_select called with index: {evt.index}")

    # Check if we have a selected run
    if current_selected_run is None:
        print("[ERROR] No run selected - current_selected_run is None")
        gr.Warning("Please select a run from the leaderboard first")
        return {}

    try:
        # Get selected test case
        selected_idx = evt.index[0]
        if df is None or df.empty or selected_idx >= len(df):
            gr.Warning("Invalid test case selection")
            return {}

        test_case = df.iloc[selected_idx].to_dict()
        trace_id = test_case.get('trace_id')

        print(f"[DEBUG] Selected test case: {test_case.get('task_id', 'Unknown')} (trace_id: {trace_id})")

        # Load trace data
        traces_dataset = current_selected_run.get('traces_dataset')
        if not traces_dataset:
            gr.Warning("No traces dataset found in current run")
            return {}

        trace_data = data_loader.get_trace_by_id(traces_dataset, trace_id)

        if not trace_data:
            gr.Warning(f"Trace not found: {trace_id}")
            return {}

        current_selected_trace = trace_data

        # Get spans and ensure it's a list
        spans = trace_data.get('spans', [])
        if hasattr(spans, 'tolist'):
            spans = spans.tolist()
        elif not isinstance(spans, list):
            spans = list(spans) if spans is not None else []

        print(f"[DEBUG] Loaded trace with {len(spans)} spans")

        # Create visualizations
        span_viz_plot = create_span_visualization(spans, trace_id)

        # Process spans for JSON display (create_span_table returns gr.JSON component, we need the data)
        simplified_spans = []
        for span in spans:
            start_time = span_timestamp(span, 'startTime')
            end_time = span_timestamp(span, 'endTime')
            duration_ms = (end_time - start_time) / 1000000 if (end_time and start_time) else 0

            span_id = span.get('spanId') or span.get('span_id') or 'N/A'
            parent_id = span.get('parentSpanId') or span.get('parent_span_id') or 'root'

            # status may be present but None / not a dict; fall back like a missing status
            status = span.get('status')
            status_code = status.get('code', 'UNKNOWN') if isinstance(status, dict) else 'UNKNOWN'

            simplified_spans.append({
                "Span ID": span_id,
                "Parent": parent_id,
                "Name": span.get('name', 'N/A'),
                "Kind": span.get('kind', 'N/A'),
                "Duration (ms)": round(duration_ms, 2),
                "Attributes": span.get('attributes', {}),
                "Status": status_code
            })

        span_details_data = simplified_spans

        # Create thought graph
        from components.thought_graph import create_thought_graph as create_network_graph
        thought_graph_plot = create_network_graph(spans, trace_id)

        # Create span details table
        span_table_df = create_span_details_table(spans)

        # Load GPU metrics (if available)
        gpu_summary_html = "<div style='padding: 20px; text-align: center;'>⚠️ No GPU metrics available (expected for API models)</div>"
        gpu_plot = None
        gpu_json_data = {}

        try:
            if 'metrics_dataset' in current_selected_run and current_selected_run['metrics_dataset']:
                metrics_dataset = current_selected_run['metrics_dataset']
                gpu_metrics_data = data_loader.load_metrics(metrics_dataset)

                if gpu_metrics_data is not None and not gpu_metrics_data.empty:
                    gpu_plot = create_gpu_metrics_dashboard(gpu_metrics_data)
                    gpu_summary_html = create_gpu_summary_cards(gpu_metrics_data)
                    gpu_json_data = gpu_metrics_data.to_dict('records')
        except Exception as e:
            # GPU metrics are optional; keep the placeholder and still show the trace
            print(f"[WARNING] Could not load GPU metrics: {e}")

        # Return dictionary with visibility updates and data
        return {
            run_detail_screen: gr.update(visible=False),
            trace_detail_screen: gr.update(visible=True),
            trace_title: gr.update(value=f"# 🔍 Trace Detail: {trace_id}"),
            trace_metadata_html: gr.update(value=create_trace_metadata_html(trace_data)),
            trace_thought_graph: gr.update(value=thought_graph_plot),
            span_visualization: gr.update(value=span_viz_plot),
            span_details_table: gr.update(value=span_table_df),
            span_details_json: gr.update(value=span_details_data),
            gpu_summary_cards_html: gr.update(value=gpu_summary_html),
            gpu_metrics_plot: gr.update(value=gpu_plot),
            gpu_metrics_json: gr.update(value=gpu_json_data)
        }

    except Exception as e:
        # UI boundary: report the failure to the user and leave the screens untouched
        print(f"[ERROR] on_test_case_select failed: {e}")
        import traceback
        traceback.print_exc()
        gr.Warning(f"Error loading trace: {e}")
        return {}



def create_performance_charts(results_df):
    """
    Create performance analysis charts for the Performance tab

    Builds a 2x2 dashboard: response-time histogram, tokens per test,
    cost per test and a success/failure pie. Each panel is only drawn when
    its source column ('execution_time_ms', 'total_tokens', 'cost_usd',
    'success') exists.

    Args:
        results_df: DataFrame with test results (None is treated as empty)

    Returns:
        Plotly figure with performance metrics; an annotated empty figure
        when there is no data or chart creation fails.
    """
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    try:
        if results_df is None or results_df.empty:
            fig = go.Figure()
            fig.add_annotation(text="No performance data available", showarrow=False)
            return fig

        # Create 2x2 subplots
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                "Response Time Distribution",
                "Token Usage per Test",
                "Cost per Test",
                "Success vs Failure"
            ),
            specs=[[{"type": "histogram"}, {"type": "bar"}],
                   [{"type": "bar"}, {"type": "pie"}]]
        )

        # Shared x positions for both per-test bar charts
        test_indices = list(range(len(results_df)))

        # 1. Response Time Distribution (Histogram)
        if 'execution_time_ms' in results_df.columns:
            fig.add_trace(
                go.Histogram(
                    x=results_df['execution_time_ms'],
                    nbinsx=20,
                    marker_color='#3498DB',
                    name='Response Time',
                    showlegend=False
                ),
                row=1, col=1
            )
            fig.update_xaxes(title_text="Time (ms)", row=1, col=1)
            fig.update_yaxes(title_text="Count", row=1, col=1)

        # 2. Token Usage per Test (Bar)
        if 'total_tokens' in results_df.columns:
            fig.add_trace(
                go.Bar(
                    x=test_indices,
                    y=results_df['total_tokens'],
                    marker_color='#9B59B6',
                    name='Tokens',
                    showlegend=False
                ),
                row=1, col=2
            )
            fig.update_xaxes(title_text="Test Index", row=1, col=2)
            fig.update_yaxes(title_text="Tokens", row=1, col=2)

        # 3. Cost per Test (Bar)
        if 'cost_usd' in results_df.columns:
            fig.add_trace(
                go.Bar(
                    x=test_indices,
                    y=results_df['cost_usd'],
                    marker_color='#E67E22',
                    name='Cost',
                    showlegend=False
                ),
                row=2, col=1
            )
            fig.update_xaxes(title_text="Test Index", row=2, col=1)
            fig.update_yaxes(title_text="Cost (USD)", row=2, col=1)

        # 4. Success vs Failure (Pie)
        if 'success' in results_df.columns:
            success_series = results_df['success']
            if success_series.dtype == object:
                # Object columns may hold the display glyph or plain booleans
                # (e.g. when the column also contains missing values); count both.
                success_series = success_series.isin([True, "✅"])

            success_count = int(success_series.sum())
            failure_count = len(results_df) - success_count

            fig.add_trace(
                go.Pie(
                    labels=['Success', 'Failure'],
                    values=[success_count, failure_count],
                    marker_colors=['#2ECC71', '#E74C3C'],
                    showlegend=True
                ),
                row=2, col=2
            )

        # Update layout
        fig.update_layout(
            height=700,
            showlegend=False,
            title_text="Performance Analysis Dashboard",
            title_x=0.5
        )

        return fig

    except Exception as e:
        # UI boundary: show the error inside the chart area instead of raising
        print(f"[ERROR] create_performance_charts: {e}")
        import traceback
        traceback.print_exc()
        fig = go.Figure()
        fig.add_annotation(text=f"Error creating charts: {str(e)}", showarrow=False)
        return fig



def go_back_to_run_detail():
    """Return the visibility updates that swap the trace detail screen for run detail."""
    target_visibility = (
        (run_detail_screen, True),
        (trace_detail_screen, False),
    )
    return {screen: gr.update(visible=shown) for screen, shown in target_visibility}


# Initialize data loader (built by create_data_loader_from_env) and the
# Navigator instance. These run at import time, before any UI is constructed.
data_loader = create_data_loader_from_env()
navigator = Navigator()

# Pre-load and cache the leaderboard data before building UI.
# NOTE: this performs the load during module import; the handlers below read
# leaderboard_df_cache instead of reloading on every event.
print("Pre-loading leaderboard data from HuggingFace...")
leaderboard_df_cache = data_loader.load_leaderboard()
print(f"Loaded {len(leaderboard_df_cache)} evaluation runs")

# Global state (already populated)
# leaderboard_df_cache is now set; refresh_leaderboard() replaces it.

# Additional global state for navigation (module-level, so shared by every
# caller of the handlers in this module)
current_selected_run = None  # dict for the run chosen on the leaderboard
current_selected_trace = None  # trace data loaded by on_test_case_select
current_drilldown_df = None  # Store currently displayed drilldown data


def load_leaderboard():
    """
    Load initial leaderboard data from cache

    Returns:
        Tuple of (leaderboard HTML, model-choices update, model-choices
        update, provider-choices update). The model choices are returned
        twice because two dropdowns consume them.
    """
    # Use pre-cached data (already loaded before UI build)
    df = leaderboard_df_cache.copy()

    html = generate_leaderboard_html(df)

    # Get filter choices; drop missing values first because sorted() cannot
    # compare NaN (float) with model/provider strings
    models = ["All Models"] + sorted(df['model'].dropna().unique().tolist())
    providers = ["All"] + sorted(df['provider'].dropna().unique().tolist())

    return html, gr.update(choices=models), gr.update(choices=models), gr.update(choices=providers)


def refresh_leaderboard():
    """
    Refresh leaderboard data from source (for reload button)

    Replaces the module-level leaderboard cache with the freshly loaded data.

    Returns:
        Tuple of (leaderboard HTML, model-choices update, model-choices update)
    """
    global leaderboard_df_cache

    print("🔄 Refreshing leaderboard data...")
    df = data_loader.refresh_leaderboard()  # Clears cache and reloads
    leaderboard_df_cache = df.copy()
    print(f"✅ Refreshed {len(df)} evaluation runs")

    html = generate_leaderboard_html(df)

    # Drop missing values first: sorted() cannot compare NaN with strings
    models = ["All Models"] + sorted(df['model'].dropna().unique().tolist())

    return html, gr.update(choices=models), gr.update(choices=models)


def apply_leaderboard_filters(agent_type, provider, sort_by_col, sort_order):
    """
    Apply filters and sorting to styled HTML leaderboard

    Args:
        agent_type: Agent type to keep, or "All"
        provider: Provider to keep, or "All"
        sort_by_col: Column name to sort by (ignored if the column is absent)
        sort_order: "Ascending" for ascending order; anything else sorts descending

    Returns:
        Leaderboard HTML for the filtered, sorted rows
    """
    global leaderboard_df_cache, model_filter

    df = leaderboard_df_cache.copy() if leaderboard_df_cache is not None else data_loader.load_leaderboard()

    # Apply model filter from sidebar
    # NOTE(review): model_filter.value looks like the component's configured
    # value rather than the user's current selection - confirm against the wiring.
    selected_model = model_filter.value if hasattr(model_filter, 'value') else "All Models"

    # (column, requested value, "no filter" marker)
    filters = (
        ('model', selected_model, "All Models"),
        ('agent_type', agent_type, "All"),
        ('provider', provider, "All"),
    )
    for column, wanted, wildcard in filters:
        # An unset dropdown (None / "") means "no filter", not "match nothing"
        if wanted is None or wanted == "" or wanted == wildcard:
            continue
        if column in df.columns:
            df = df[df[column] == wanted]

    # Sort (skip silently if the requested column does not exist)
    ascending = (sort_order == "Ascending")
    if sort_by_col in df.columns:
        df = df.sort_values(by=sort_by_col, ascending=ascending)

    html = generate_leaderboard_html(df, sort_by_col, ascending)
    return html


def apply_drilldown_filters(agent_type, provider, sort_by_col, sort_order):
    """
    Apply filters and sorting to drilldown table

    Args:
        agent_type: Agent type to keep, or "All"
        provider: Provider to keep, or "All"
        sort_by_col: Column name to sort by (ignored if the column is absent)
        sort_order: "Ascending" for ascending order; anything else sorts descending

    Returns:
        ``gr.update`` carrying the display DataFrame (renamed columns, only
        those display columns that exist in the data)
    """
    global leaderboard_df_cache

    df = leaderboard_df_cache.copy() if leaderboard_df_cache is not None else data_loader.load_leaderboard()

    # Apply model filter from sidebar
    # NOTE(review): model_filter.value looks like the component's configured
    # value rather than the user's current selection - confirm against the wiring.
    selected_model = model_filter.value if hasattr(model_filter, 'value') else "All Models"

    # (column, requested value, "no filter" marker)
    filters = (
        ('model', selected_model, "All Models"),
        ('agent_type', agent_type, "All"),
        ('provider', provider, "All"),
    )
    for column, wanted, wildcard in filters:
        # An unset dropdown (None / "") means "no filter", not "match nothing"
        if wanted is None or wanted == "" or wanted == wildcard:
            continue
        if column in df.columns:
            df = df[df[column] == wanted]

    # Sort (skip silently if the requested column does not exist)
    ascending = (sort_order == "Ascending")
    if sort_by_col in df.columns:
        df = df.sort_values(by=sort_by_col, ascending=ascending)
    df = df.reset_index(drop=True)

    # Prepare simplified dataframe for display; renaming by name (not by
    # position) stays correct when some columns are missing
    display_names = {
        'run_id': 'Run ID',
        'model': 'Model',
        'agent_type': 'Agent Type',
        'provider': 'Provider',
        'success_rate': 'Success Rate',
        'total_tests': 'Tests',
        'avg_duration_ms': 'Duration (ms)',
        'total_cost_usd': 'Cost (USD)',
        'submitted_by': 'Submitted By',
    }
    available = [column for column in display_names if column in df.columns]
    display_df = df[available].rename(columns=display_names)

    return gr.update(value=display_df)


def apply_sidebar_filters(selected_model, selected_agent_type):
    """
    Apply sidebar filters to both leaderboard tabs

    Args:
        selected_model: Model to keep, or "All Models"
        selected_agent_type: Agent type to keep, or "All"

    Returns:
        Dict of component -> ``gr.update`` for the HTML leaderboard, the
        drilldown table, the trends plot and both compare-run dropdowns.
    """
    global leaderboard_df_cache

    df = leaderboard_df_cache.copy() if leaderboard_df_cache is not None else data_loader.load_leaderboard()

    # (column, requested value, "no filter" marker)
    filters = (
        ('model', selected_model, "All Models"),
        ('agent_type', selected_agent_type, "All"),
    )
    for column, wanted, wildcard in filters:
        # An unset dropdown (None / "") means "no filter", not "match nothing"
        if wanted is None or wanted == "" or wanted == wildcard:
            continue
        if column in df.columns:
            df = df[df[column] == wanted]

    # For HTML leaderboard
    if 'success_rate' in df.columns:
        sorted_df = df.sort_values(by='success_rate', ascending=False).reset_index(drop=True)
    else:
        sorted_df = df.reset_index(drop=True)
    html = generate_leaderboard_html(sorted_df, 'success_rate', False)

    # For drilldown table; renaming by name stays correct if a column is missing
    display_names = {
        'run_id': 'Run ID',
        'model': 'Model',
        'agent_type': 'Agent Type',
        'provider': 'Provider',
        'success_rate': 'Success Rate',
        'total_tests': 'Tests',
        'avg_duration_ms': 'Duration (ms)',
        'total_cost_usd': 'Cost (USD)',
        'submitted_by': 'Submitted By',
    }
    available = [column for column in display_names if column in df.columns]
    display_df = df[available].rename(columns=display_names)

    # Update trends
    trends_fig = create_trends_plot(df)

    # Update compare dropdowns
    compare_choices = []
    for _, row in df.iterrows():
        run_id = row.get('run_id', '')
        # The composite value always contains "|", so emptiness has to be
        # checked on the run id itself to skip rows that have none.
        if pd.isna(run_id) or run_id == '':
            continue
        label = f"{row.get('model', 'Unknown')} - {row.get('timestamp', 'N/A')}"
        # Use composite key: run_id|timestamp to ensure uniqueness
        value = f"{run_id}|{row.get('timestamp', '')}"
        compare_choices.append((label, value))

    return {
        leaderboard_by_model: gr.update(value=html),
        leaderboard_table: gr.update(value=display_df),
        trends_plot: gr.update(value=trends_fig),
        compare_components['compare_run_a_dropdown']: gr.update(choices=compare_choices),
        compare_components['compare_run_b_dropdown']: gr.update(choices=compare_choices)
    }


def load_drilldown(agent_type, provider):
    """
    Load the drilldown table, filtered by agent type and provider.

    The filtered frame with every column is kept in the module-level
    ``current_drilldown_df``; only a fixed subset of columns (those that
    exist) is returned for display. Any error yields an empty DataFrame.
    """
    global current_drilldown_df

    try:
        runs = data_loader.load_leaderboard()

        if runs.empty:
            current_drilldown_df = pd.DataFrame()
            return pd.DataFrame()

        # A filter applies only when a specific value is requested and the column exists
        for column, choice in (('agent_type', agent_type), ('provider', provider)):
            if choice != "All" and column in runs.columns:
                runs = runs[runs[column] == choice]

        # Keep ALL columns in global state so later handlers can reach
        # results_dataset, traces_dataset, etc.
        current_drilldown_df = runs.copy()

        # Columns wanted in the UI table, restricted to the ones actually present
        shown = [
            name
            for name in (
                'run_id', 'model', 'agent_type', 'provider',
                'success_rate', 'total_tests', 'avg_duration_ms', 'total_cost_usd',
            )
            if name in runs.columns
        ]

        if not shown:
            # None of the display columns exist
            return pd.DataFrame()

        return runs[shown].copy()
    except Exception as e:
        print(f"[ERROR] load_drilldown: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()


def load_trends():
    """Build the trends figure from the leaderboard returned by the data loader."""
    return create_trends_plot(data_loader.load_leaderboard())


def get_chart_explanation(viz_type):
    """Return the markdown help text for a chart type (heatmap text for unknown types)."""
    heatmap_text = """
#### 🔥 Performance Heatmap

**What it shows:** All models compared across all metrics in one view

**How to read it:**
- 🟢 **Green cells** = Better performance (higher is better)
- 🟡 **Yellow cells** = Average performance
- 🔴 **Red cells** = Worse performance (needs improvement)

**Metrics displayed:**
- Success Rate (%), Avg Duration (ms), Total Cost ($)
- CO2 Emissions (g), GPU Utilization (%), Total Tokens

**Use it to:** Quickly identify which models excel in which areas
        """

    speed_text = """
#### ⚡ Speed vs Accuracy Trade-off

**What it shows:** The relationship between model speed and accuracy

**How to read it:**
- **X-axis** = Average Duration (log scale) - left is faster
- **Y-axis** = Success Rate (%) - higher is better
- **Bubble size** = Total Cost - larger bubbles are more expensive
- **Color** = Agent Type (tool/code/both)

**Sweet spot:** Top-left quadrant = ⭐ **Fast & Accurate** models

**Quadrant lines:**
- Median lines split the chart into 4 zones
- Models above/left of medians are better than average

**Use it to:** Find models that balance speed and accuracy for your needs
        """

    cost_text = """
#### 💰 Cost-Performance Efficiency

**What it shows:** Best value-for-money models

**How to read it:**
- **X-axis** = Total Cost (log scale) - left is cheaper
- **Y-axis** = Success Rate (%) - higher is better
- **Bubble size** = Duration - smaller bubbles are faster
- **Color** = Provider (blue=API, green=GPU/local)
- **⭐ Stars** = Top 3 most efficient models

**Cost bands:**
- 🟢 **Budget** = < $0.01 per run
- 🟡 **Mid-Range** = $0.01 - $0.10 per run
- 🟠 **Premium** = > $0.10 per run

**Efficiency metric:** Success Rate ÷ Cost (higher is better)

**Use it to:** Maximize ROI by finding models with best success-to-cost ratio
        """

    # Keys are the exact labels of the visualization choices
    text_by_label = {
        "🔥 Performance Heatmap": heatmap_text,
        "⚡ Speed vs Accuracy": speed_text,
        "💰 Cost Efficiency": cost_text,
    }
    return text_by_label.get(viz_type, heatmap_text)


def update_analytics(viz_type):
    """Return (chart, explanation markdown) for the selected visualization type."""
    leaderboard = data_loader.load_leaderboard()

    # First keyword found in the label picks the chart; cost efficiency is the fallback
    keyword_builders = (
        ("Heatmap", create_performance_heatmap),
        ("Speed", create_speed_accuracy_scatter),
    )
    build_chart = next(
        (builder for keyword, builder in keyword_builders if keyword in viz_type),
        create_cost_efficiency_scatter,
    )

    figure = build_chart(leaderboard)
    help_text = get_chart_explanation(viz_type)
    return figure, help_text


def generate_card(top_n):
    """Return (summary card HTML, visibility update); the update hides its target when there is no data."""
    leaderboard = data_loader.load_leaderboard()

    has_rows = leaderboard is not None and not leaderboard.empty
    if has_rows:
        card_markup = generate_leaderboard_summary_card(leaderboard, top_n)
        return card_markup, gr.update(visible=True)

    return "<p>No data available</p>", gr.update(visible=False)


def generate_insights():
    """Build the markdown leaderboard summary (top performers and averages)."""
    try:
        runs = data_loader.load_leaderboard()

        no_data = runs.empty or 'success_rate' not in runs.columns
        if no_data:
            return "## 📊 Leaderboard Summary\n\nNo data available for insights."

        # Row labels of the winners in each category
        best_accuracy_label = runs['success_rate'].idxmax()
        # Small constant keeps zero-cost runs from dividing by zero
        value_ratio = runs['success_rate'] / (runs['total_cost_usd'] + 0.0001)
        best_value_label = value_ratio.idxmax()
        quickest_label = runs['avg_duration_ms'].idxmin()

        best = runs.loc[best_accuracy_label]
        thrifty = runs.loc[best_value_label]
        quickest = runs.loc[quickest_label]

        summary = f"""
## 📊 Leaderboard Summary

**Total Runs:** {len(runs)}

**Top Performers:**
- 🥇 **Best Accuracy:** {best['model']} ({best['success_rate']:.1f}%)
- 💰 **Most Cost-Effective:** {thrifty['model']} ({thrifty['success_rate']:.1f}% @ ${thrifty['total_cost_usd']:.4f})
- ⚡ **Fastest:** {quickest['model']} ({quickest['avg_duration_ms']:.0f}ms avg)

**Key Trends:**
- Average Success Rate: {runs['success_rate'].mean():.1f}%
- Average Cost: ${runs['total_cost_usd'].mean():.4f}
- Average Duration: {runs['avg_duration_ms'].mean():.0f}ms

---

*Note: AI-powered insights will be available via MCP integration in the full version.*
        """

        return summary
    except Exception as e:
        print(f"[ERROR] generate_insights: {e}")
        import traceback
        traceback.print_exc()
        return f"## 📊 Leaderboard Summary\n\nError generating insights: {str(e)}"


def on_html_table_row_click(row_index_str):
    """
    Handle row click from HTML table via JavaScript (hidden textbox bridge)

    Resolves the clicked row against the cached leaderboard, stores it as the
    current run, loads its results dataset and switches from the leaderboard
    screen to the run detail screen.

    Args:
        row_index_str: Row position as text, as written into the hidden
            textbox; empty/blank values are ignored.

    Returns:
        Dict of component -> ``gr.update``. Every response clears the hidden
        textbox so the same row can be clicked again.
    """
    global current_selected_run, leaderboard_df_cache

    from html import escape

    def unchanged_response():
        """Leave every screen as-is and only clear the hidden row-index textbox."""
        return {
            leaderboard_screen: gr.update(),
            run_detail_screen: gr.update(),
            run_metadata_html: gr.update(),
            test_cases_table: gr.update(),
            run_card_html: gr.update(),
            selected_row_index: gr.update(value="")  # Clear textbox
        }

    print(f"[DEBUG] on_html_table_row_click called with: '{row_index_str}'")

    try:
        # Parse row index from string
        if not row_index_str or not row_index_str.strip():
            print("[DEBUG] Empty row index, ignoring")
            return unchanged_response()

        selected_idx = int(row_index_str)
        print(f"[DEBUG] Parsed row index: {selected_idx}")

        # Get the full run data from cache
        if leaderboard_df_cache is None or leaderboard_df_cache.empty:
            print("[ERROR] Leaderboard cache is empty")
            gr.Warning("Leaderboard data not loaded")
            return unchanged_response()

        if selected_idx < 0 or selected_idx >= len(leaderboard_df_cache):
            print(f"[ERROR] Invalid row index: {selected_idx}, cache size: {len(leaderboard_df_cache)}")
            gr.Warning(f"Invalid row index: {selected_idx}")
            return unchanged_response()

        # NOTE(review): the index is applied positionally to the unfiltered
        # cache; confirm the rendered HTML table emits cache positions even
        # after filtering/sorting.
        run_data = leaderboard_df_cache.iloc[selected_idx].to_dict()

        # Set global
        current_selected_run = run_data

        print(f"[DEBUG] Selected run from HTML table: {run_data.get('model', 'Unknown')} (row {selected_idx})")

        # Load results for this run
        results_dataset = run_data.get('results_dataset')
        if not results_dataset:
            gr.Warning("No results dataset found for this run")
            return {
                leaderboard_screen: gr.update(visible=True),
                run_detail_screen: gr.update(visible=False),
                run_metadata_html: gr.update(value="<h3>No results dataset found</h3>"),
                test_cases_table: gr.update(value=pd.DataFrame()),
                run_card_html: gr.update(),
                selected_row_index: gr.update(value="")
            }

        results_df = data_loader.load_results(results_dataset)

        # Text fields come from the loaded dataset, so escape them before
        # embedding them in HTML
        model_name = escape(str(run_data.get('model', 'Unknown')))
        agent_type = escape(str(run_data.get('agent_type', 'N/A')))
        provider = escape(str(run_data.get('provider', 'N/A')))
        submitted_by = escape(str(run_data.get('submitted_by', 'Unknown')))

        # Create metadata HTML
        metadata_html = f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
            <h2 style="margin: 0 0 10px 0;">📊 Run Detail: {model_name}</h2>
            <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
                <div>
                    <strong>Agent Type:</strong> {agent_type}<br>
                    <strong>Provider:</strong> {provider}<br>
                    <strong>Success Rate:</strong> {run_data.get('success_rate', 0):.1f}%
                </div>
                <div>
                    <strong>Total Tests:</strong> {run_data.get('total_tests', 0)}<br>
                    <strong>Successful:</strong> {run_data.get('successful_tests', 0)}<br>
                    <strong>Failed:</strong> {run_data.get('failed_tests', 0)}
                </div>
                <div>
                    <strong>Total Cost:</strong> ${run_data.get('total_cost_usd', 0):.4f}<br>
                    <strong>Avg Duration:</strong> {run_data.get('avg_duration_ms', 0):.0f}ms<br>
                    <strong>Submitted By:</strong> {submitted_by}
                </div>
            </div>
        </div>
        """

        # Generate run report card HTML
        run_card_html_content = generate_run_report_card(run_data)

        # Format results for display
        display_df = results_df.copy()

        # Per-column display formatters (None = show the raw value); the
        # tuple order is the column order of the table
        column_formatters = (
            ('task_id', None),
            ('success', lambda x: "✅" if x else "❌"),
            ('tool_called', None),
            ('execution_time_ms', lambda x: f"{x:.0f}ms"),
            ('total_tokens', None),
            ('cost_usd', lambda x: f"${x:.4f}"),
            ('trace_id', None),
        )
        display_columns = []
        for column, formatter in column_formatters:
            if column not in display_df.columns:
                continue
            if formatter is not None:
                display_df[column] = display_df[column].apply(formatter)
            display_columns.append(column)

        if display_columns:
            display_df = display_df[display_columns]

        print(f"[DEBUG] Successfully loaded run detail for: {run_data.get('model', 'Unknown')}")

        return {
            # Hide leaderboard, show run detail
            leaderboard_screen: gr.update(visible=False),
            run_detail_screen: gr.update(visible=True),
            run_metadata_html: gr.update(value=metadata_html),
            test_cases_table: gr.update(value=display_df),
            run_card_html: gr.update(value=run_card_html_content),
            selected_row_index: gr.update(value="")  # Clear textbox
        }

    except Exception as e:
        # UI boundary: warn the user and stay on the leaderboard
        print(f"[ERROR] Handling HTML table row click: {e}")
        import traceback
        traceback.print_exc()
        gr.Warning(f"Error loading run details: {str(e)}")
        return {
            leaderboard_screen: gr.update(visible=True),  # Stay on leaderboard
            run_detail_screen: gr.update(visible=False),
            run_metadata_html: gr.update(),
            test_cases_table: gr.update(),
            run_card_html: gr.update(),
            selected_row_index: gr.update(value="")  # Clear textbox
        }


def _is_missing(value):
    """Return True when *value* is None or a scalar NaN/NA marker."""
    try:
        return value is None or bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna() on a list-like cell yields an array that has no single
        # truth value; such a cell holds data, so it is not "missing".
        return False


def _format_number(value, spec, prefix="", suffix=""):
    """Format a numeric value with *spec*; missing or non-numeric values become "N/A"."""
    if _is_missing(value):
        return "N/A"
    try:
        return f"{prefix}{float(value):{spec}}{suffix}"
    except (TypeError, ValueError):
        return "N/A"


def _render_run_metadata_html(run_data):
    """
    Build the gradient "Run Detail" header card for a leaderboard row

    Args:
        run_data: Dict of leaderboard fields for a single run

    Returns:
        HTML string; text fields are HTML-escaped and numeric fields that are
        missing or non-numeric are shown as "N/A"
    """
    import html

    def text(key, default):
        # These values come from the loaded leaderboard data (model names,
        # submitter names, ...), so treat them as untrusted text and escape
        # them before embedding them in markup.
        value = run_data.get(key, default)
        if _is_missing(value):
            value = default
        return html.escape(str(value))

    model = text('model', 'Unknown')
    agent_type = text('agent_type', 'N/A')
    provider = text('provider', 'N/A')
    total_tests = text('total_tests', 0)
    successful_tests = text('successful_tests', 0)
    failed_tests = text('failed_tests', 0)
    submitted_by = text('submitted_by', 'Unknown')

    success_rate = _format_number(run_data.get('success_rate', 0), ".1f", suffix="%")
    total_cost = _format_number(run_data.get('total_cost_usd', 0), ".4f", prefix="$")
    avg_duration = _format_number(run_data.get('avg_duration_ms', 0), ".0f", suffix="ms")

    return f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
            <h2 style="margin: 0 0 10px 0;">📊 Run Detail: {model}</h2>
            <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
                <div>
                    <strong>Agent Type:</strong> {agent_type}<br>
                    <strong>Provider:</strong> {provider}<br>
                    <strong>Success Rate:</strong> {success_rate}
                </div>
                <div>
                    <strong>Total Tests:</strong> {total_tests}<br>
                    <strong>Successful:</strong> {successful_tests}<br>
                    <strong>Failed:</strong> {failed_tests}
                </div>
                <div>
                    <strong>Total Cost:</strong> {total_cost}<br>
                    <strong>Avg Duration:</strong> {avg_duration}<br>
                    <strong>Submitted By:</strong> {submitted_by}
                </div>
            </div>
        </div>
        """


def _format_results_for_display(results_df):
    """
    Prepare a results DataFrame for the test cases table

    Args:
        results_df: DataFrame of per-test-case results

    Returns:
        Copy of the frame restricted to the known display columns (in a fixed
        order) with success, duration and cost rendered as strings. If none of
        the known columns are present the copy is returned unchanged.
    """
    display_df = results_df.copy()

    # Known columns in display order; None means "show the raw value".
    column_formatters = {
        'task_id': None,
        # A missing flag is shown as failed rather than as a pass.
        'success': lambda x: "✅" if (not _is_missing(x) and x) else "❌",
        'tool_called': None,
        'execution_time_ms': lambda x: _format_number(x, ".0f", suffix="ms"),
        'total_tokens': None,
        'cost_usd': lambda x: _format_number(x, ".4f", prefix="$"),
        'trace_id': None,
    }

    display_columns = [name for name in column_formatters if name in display_df.columns]
    for name in display_columns:
        formatter = column_formatters[name]
        if formatter is not None:
            display_df[name] = display_df[name].apply(formatter)

    if display_columns:
        display_df = display_df[display_columns]

    return display_df


def _stay_on_leaderboard_updates(metadata_message=None):
    """
    Build the update mapping that keeps the user on the leaderboard screen

    Args:
        metadata_message: Optional HTML to place in the run metadata panel.
            When given, the test cases table is also reset to an empty frame;
            when omitted, both components are left untouched.

    Returns:
        Dict with one update per run-detail output component, so that every
        exit path of the handler returns the same set of keys
    """
    if metadata_message is None:
        metadata_update = gr.update()
        table_update = gr.update()
    else:
        metadata_update = gr.update(value=metadata_message)
        table_update = gr.update(value=pd.DataFrame())

    return {
        leaderboard_screen: gr.update(visible=True),  # Stay on leaderboard
        run_detail_screen: gr.update(visible=False),
        run_metadata_html: metadata_update,
        test_cases_table: table_update,
        performance_charts: gr.update(),
        run_card_html: gr.update()
    }


def load_run_detail(run_id):
    """
    Load run detail data including results dataset

    Looks the run up in the cached leaderboard, records it as the currently
    selected run and loads its results dataset through the data loader.

    Args:
        run_id: Value matched against the 'run_id' column of the cached leaderboard

    Returns:
        Tuple of (display DataFrame, metadata HTML, run id). On any failure the
        tuple is (empty DataFrame, "# Error ..." markdown message, "").
    """
    global current_selected_run, leaderboard_df_cache

    try:
        # Find run in cache
        df = leaderboard_df_cache
        if df is None or df.empty:
            return pd.DataFrame(), "# Error\n\nLeaderboard data not available", ""

        matches = df[df['run_id'] == run_id]
        if matches.empty:
            return pd.DataFrame(), f"# Error\n\nRun not found: {run_id}", ""

        run_data = matches.iloc[0].to_dict()
        current_selected_run = run_data

        # Load results dataset (a NaN cell is truthy, so check for it explicitly)
        results_dataset = run_data.get('results_dataset')
        if _is_missing(results_dataset) or not results_dataset:
            return pd.DataFrame(), "# Error\n\nNo results dataset found for this run", ""

        results_df = data_loader.load_results(results_dataset)

        metadata_html = _render_run_metadata_html(run_data)
        display_df = _format_results_for_display(results_df)

        return display_df, metadata_html, run_data.get('run_id', '')

    except Exception as e:
        print(f"[ERROR] load_run_detail: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"# Error\n\nError loading run detail: {str(e)}", ""



# Screen 3 (Run Detail) event handlers
def on_drilldown_select(evt: gr.SelectData, df):
    """
    Handle row selection from the DrillDown table

    Resolves the clicked row against the currently displayed (filtered/sorted)
    drilldown DataFrame, loads that run's results and switches to the run
    detail screen.

    Args:
        evt: Gradio select event; evt.index[0] is used as the row position
        df: Unused here; presumably the table value supplied by the event
            wiring (not visible in this part of the file). The row is looked
            up in the current_drilldown_df global instead.

    Returns:
        Dict mapping the leaderboard screen, run detail screen, metadata HTML,
        test cases table, performance chart and report card components to
        updates. Every exit path returns all of these keys.
    """
    global current_selected_run, current_drilldown_df

    try:
        # Get selected run - use currently displayed dataframe (filtered/sorted)
        selected_idx = evt.index[0]

        if current_drilldown_df is None or current_drilldown_df.empty:
            gr.Warning("Leaderboard data not available")
            return _stay_on_leaderboard_updates()

        if not 0 <= selected_idx < len(current_drilldown_df):
            gr.Warning(f"Invalid row selection: index {selected_idx} out of bounds")
            return _stay_on_leaderboard_updates()

        # Positional lookup in the displayed frame gives the correct row even
        # after filtering/sorting.
        run_data = current_drilldown_df.iloc[selected_idx].to_dict()

        # IMPORTANT: Set global FIRST before any operations that might fail
        current_selected_run = run_data

        # str() so a non-string run_id cannot make the debug print raise
        run_id_preview = str(run_data.get('run_id', 'N/A'))[:8]
        print(f"[DEBUG] Selected run: {run_data.get('model', 'Unknown')} (run_id: {run_id_preview}...)")

        # Load results for this run (a NaN cell is truthy, so check explicitly)
        results_dataset = run_data.get('results_dataset')
        if _is_missing(results_dataset) or not results_dataset:
            gr.Warning("No results dataset found for this run")
            return _stay_on_leaderboard_updates("<h3>No results dataset found</h3>")

        results_df = data_loader.load_results(results_dataset)

        # Generate performance chart
        perf_chart = create_performance_charts(results_df)

        # Create metadata HTML
        metadata_html = _render_run_metadata_html(run_data)

        # Generate run report card HTML
        run_card_html_content = generate_run_report_card(run_data)

        # Format results for display
        display_df = _format_results_for_display(results_df)

        print(f"[DEBUG] Successfully loaded run detail for: {run_data.get('model', 'Unknown')}")

        return {
            # Hide leaderboard, show run detail
            leaderboard_screen: gr.update(visible=False),
            run_detail_screen: gr.update(visible=True),
            run_metadata_html: gr.update(value=metadata_html),
            test_cases_table: gr.update(value=display_df),
            performance_charts: gr.update(value=perf_chart),
            run_card_html: gr.update(value=run_card_html_content)
        }

    except Exception as e:
        print(f"[ERROR] Loading run details: {e}")
        import traceback
        traceback.print_exc()
        gr.Warning(f"Error loading run details: {e}")

        # Return updates for all output components to avoid Gradio error
        return _stay_on_leaderboard_updates("<h3>Error loading run detail</h3>")



def go_back_to_leaderboard():
    """
    Switch the visible screen from run detail back to the leaderboard

    Returns:
        Dict mapping the leaderboard screen to a "visible" update and the
        run detail screen to a "hidden" update
    """
    show_update = gr.update(visible=True)
    hide_update = gr.update(visible=False)

    screen_updates = {}
    screen_updates[leaderboard_screen] = show_update
    screen_updates[run_detail_screen] = hide_update
    return screen_updates


# Build Gradio app
# Theme configuration (like MockTraceMind)
# Starts from Gradio's Base theme with an indigo/purple/slate palette and the
# "Inter" Google font, then overrides individual style variables via .set().
# Values beginning with "*" (e.g. "*neutral_50") reference other theme
# variables rather than literal colors (see Gradio's theming guide).
# The resulting `theme` object is passed to gr.Blocks(...) below.
theme = gr.themes.Base(
    primary_hue="indigo",
    secondary_hue="purple",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
).set(
    # Page background: the plain variable and its *_dark counterpart
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_900",
    # Primary buttons: fill, hover fill and label color
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
)

with gr.Blocks(title="TraceMind-AI", theme=theme) as app:

    # Top Banner
    gr.HTML("""
    <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                padding: 25px;
                border-radius: 10px;
                margin-bottom: 20px;
                text-align: center;
                box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
        <h1 style="color: white !important; margin: 0; font-size: 2.5em; font-weight: bold;">
            🧠 TraceMind
        </h1>
        <p style="color: rgba(255,255,255,0.9); margin: 10px 0 0 0; font-size: 1.2em;">
            Agent Evaluation Platform
        </p>
        <p style="color: rgba(255,255,255,0.8); margin: 10px 0 0 0; font-size: 0.9em;">
            Powered by Gradio 🚀 | HuggingFace Jobs | TraceVerde | SmolTrace | MCP | Gemini | Modal
        </p>
    </div>
    """)

    # Main app container (wraps Sidebar + all screens like MockTraceMind)
    with gr.Column() as main_app_container:

        
        # Sidebar Navigation
        with gr.Sidebar():
            gr.Markdown("## 🧠 TraceMind")
            gr.Markdown("*Navigation & Controls*")
    
            gr.Markdown("---")
    
            # Navigation section
            gr.Markdown("### 🧭 Navigation")

            # Navigation buttons
            dashboard_nav_btn = gr.Button("📊 Dashboard", variant="primary", size="lg")
            leaderboard_nav_btn = gr.Button("🏆 Leaderboard", variant="secondary", size="lg")
            compare_nav_btn = gr.Button("⚖️ Compare", variant="secondary", size="lg")
            docs_nav_btn = gr.Button("📚 Documentation", variant="secondary", size="lg")
    
            gr.Markdown("---")
    
            # Data Controls
            gr.Markdown("### 🔄 Data Controls")
            refresh_leaderboard_btn = gr.Button("🔄 Refresh Data", variant="secondary", size="sm")
            gr.Markdown("*Reload leaderboard from HuggingFace*")
    
            gr.Markdown("---")
    
            # Filters section
            gr.Markdown("### 🔍 Filters")

            model_filter = gr.Dropdown(
                choices=["All Models"],
                value="All Models",
                label="Model",
                info="Filter evaluations by AI model. Select 'All Models' to see all runs."
            )

            sidebar_agent_type_filter = gr.Radio(
                choices=["All", "tool", "code", "both"],
                value="All",
                label="Agent Type",
                info="Tool: Function calling agents | Code: Code execution | Both: Hybrid agents"
            )

        # Main content area
        # Screen 0: Dashboard
        dashboard_screen, dashboard_components = create_dashboard_ui()

        # Screen 1: Main Leaderboard
        with gr.Column(visible=False) as leaderboard_screen:
            gr.Markdown("## 🏆 Agent Evaluation Leaderboard")
            with gr.Tabs():
                with gr.TabItem("🏆 Leaderboard"):
                    gr.Markdown("*Styled leaderboard with inline filters*")

                    # User Guide Accordion
                    with gr.Accordion("📖 How to Use the Leaderboard", open=False):
                        gr.Markdown("""
                        ### 🏆 Interactive Leaderboard View

                        **What is this tab?**
                        The main leaderboard displays all evaluation runs in a styled HTML table with color-coded performance indicators.

                        **How to use it:**
                        - 🎨 **Visual Design**: Gradient cards with model logos and performance metrics
                        - 🔍 **Filters**: Use agent type, provider, and sorting controls above
                        - 📊 **Sort Options**: Click "Sort By" to order by success rate, cost, duration, or tokens

                        **Performance Indicators:**
                        - 🟢 Green metrics = Excellent performance
                        - 🟡 Yellow metrics = Average performance
                        - 🔴 Red metrics = Needs improvement

                        **Tips:**
                        - Use sidebar filters to narrow down by model
                        - Apply inline filters for more granular control
                        - Switch to "DrillDown" tab for a raw table view
                        """)

                    # Inline filters for styled leaderboard
                    with gr.Row():
                        with gr.Column(scale=1):
                            agent_type_filter = gr.Radio(
                                choices=["All", "tool", "code", "both"],
                                value="All",
                                label="Agent Type",
                                info="Filter by agent type"
                            )
                        with gr.Column(scale=1):
                            provider_filter = gr.Dropdown(
                                choices=["All"],
                                value="All",
                                label="Provider",
                                info="Filter by provider"
                            )
                        with gr.Column(scale=1):
                            sort_by_dropdown = gr.Dropdown(
                                choices=["success_rate", "total_cost_usd", "avg_duration_ms", "total_tokens"],
                                value="success_rate",
                                label="Sort By"
                            )
                        with gr.Column(scale=1):
                            sort_order = gr.Radio(
                                choices=["Descending", "Ascending"],
                                value="Descending",
                                label="Sort Order"
                            )

                    with gr.Row():
                        apply_filters_btn = gr.Button("🔍 Apply Filters", variant="primary", size="sm")

                    # Styled HTML leaderboard
                    leaderboard_by_model = gr.HTML(label="Styled Leaderboard")
    
                with gr.TabItem("📋 DrillDown"):
                    gr.Markdown("*Click any row to view detailed run information*")

                    # User Guide Accordion
                    with gr.Accordion("📖 How to Use DrillDown", open=False):
                        gr.Markdown("""
                        ### 📋 Data Table View

                        **What is this tab?**
                        The DrillDown tab provides a raw, sortable table view of all evaluation runs with full details.

                        **How to use it:**
                        - 📊 **Table Format**: Clean, spreadsheet-like view of all runs
                        - 🔍 **Filters**: Apply agent type, provider, and sorting controls
                        - 📥 **Export Ready**: Easy to copy/paste data for reports
                        - 👆 **Click Rows**: Click any row to navigate to detailed run view
                        - 🔢 **All Metrics**: Shows run ID, model, success rate, cost, duration, and more

                        **Columns Explained:**
                        - **Run ID**: Unique identifier for each evaluation
                        - **Model**: AI model that was evaluated
                        - **Agent Type**: tool (function calling), code (code execution), or both
                        - **Provider**: litellm (API models) or transformers (local models)
                        - **Success Rate**: Percentage of test cases passed
                        - **Tests**: Number of test cases executed
                        - **Duration**: Average execution time in milliseconds
                        - **Cost**: Total cost in USD for this run
                        - **Submitted By**: HuggingFace username of evaluator

                        **Tips:**
                        - Use this for detailed data analysis
                        - Combine with sidebar filters for focused views
                        - Sort by any column to find best/worst performers
                        """)

                    # Inline filters for drilldown table
                    with gr.Row():
                        with gr.Column(scale=1):
                            drilldown_agent_type_filter = gr.Radio(
                                choices=["All", "tool", "code", "both"],
                                value="All",
                                label="Agent Type",
                                info="Filter by agent type"
                            )
                        with gr.Column(scale=1):
                            drilldown_provider_filter = gr.Dropdown(
                                choices=["All"],
                                value="All",
                                label="Provider",
                                info="Filter by provider"
                            )
                        with gr.Column(scale=1):
                            drilldown_sort_by_dropdown = gr.Dropdown(
                                choices=["success_rate", "total_cost_usd", "avg_duration_ms", "total_tokens"],
                                value="success_rate",
                                label="Sort By"
                            )
                        with gr.Column(scale=1):
                            drilldown_sort_order = gr.Radio(
                                choices=["Descending", "Ascending"],
                                value="Descending",
                                label="Sort Order"
                            )

                    with gr.Row():
                        apply_drilldown_filters_btn = gr.Button("🔍 Apply Filters", variant="primary", size="sm")

                    # Simple table controlled by inline filters
                    leaderboard_table = gr.Dataframe(
                        headers=["Run ID", "Model", "Agent Type", "Provider", "Success Rate", "Tests", "Duration (ms)", "Cost (USD)", "Submitted By"],
                        interactive=False,
                        wrap=True
                    )
    
                with gr.TabItem("📈 Trends"):
                    # User Guide Accordion
                    with gr.Accordion("📖 How to Read Trends", open=False):
                        gr.Markdown("""
                        ### 📈 Temporal Performance Analysis

                        **What is this tab?**
                        The Trends tab visualizes how model performance evolves over time, helping you identify patterns and improvements.

                        **How to read it:**
                        - 📅 **X-axis**: Timeline showing when evaluations were run
                        - 📊 **Y-axis**: Performance metrics (success rate, cost, duration, etc.)
                        - 📈 **Line Charts**: Each line represents a different model
                        - 🎨 **Color Coding**: Different colors for different models
                        - 🔍 **Interactive**: Hover over points to see exact values

                        **What to look for:**
                        - **Upward trends** = Model improvements over time
                        - **Downward trends** = Performance degradation (needs investigation)
                        - **Flat lines** = Consistent performance
                        - **Spikes** = Anomalies or special test conditions
                        - **Gaps** = Periods without evaluations

                        **Use cases:**
                        - Track model version improvements
                        - Identify when performance degraded
                        - Compare model evolution over time
                        - Spot patterns in cost or latency changes
                        - Validate optimization efforts

                        **Tips:**
                        - Use sidebar filters to focus on specific models
                        - Look for correlation between cost and accuracy
                        - Identify best time periods for each model
                        """)

                    trends_plot = gr.Plot()
    
                with gr.TabItem("📊 Analytics"):
                    viz_type = gr.Radio(
                        choices=["🔥 Performance Heatmap", "⚡ Speed vs Accuracy", "💰 Cost Efficiency"],
                        value="🔥 Performance Heatmap",
                        label="Select Visualization",
                        info="Choose which analytics chart to display"
                    )
                    analytics_chart = gr.Plot(label="Interactive Chart", show_label=False)

                    # Explanation panel in accordion (dynamically updates based on chart selection)
                    with gr.Accordion("💡 How to Read This Chart", open=False):
                        viz_explanation = gr.Markdown("""
                        #### 🔥 Performance Heatmap

                        **What it shows:** All models compared across all metrics in one view

                        **How to read it:**
                        - 🟢 **Green cells** = Better performance (higher is better)
                        - 🟡 **Yellow cells** = Average performance
                        - 🔴 **Red cells** = Worse performance (needs improvement)

                        **Metrics displayed:**
                        - Success Rate (%), Avg Duration (ms), Total Cost ($)
                        - CO2 Emissions (g), GPU Utilization (%), Total Tokens

                        **Use it to:** Quickly identify which models excel in which areas
                        """, elem_id="viz-explanation")

                with gr.TabItem("📥 Summary Card"):
                    # User Guide Accordion
                    with gr.Accordion("📖 How to Create Summary Cards", open=False):
                        gr.Markdown("""
                        ### 📥 Downloadable Leaderboard Summary Card

                        **What is this tab?**
                        Generate professional, shareable summary cards with top performers and key statistics.
                        Perfect for presentations, reports, and sharing results with your team!

                        **How to use it:**
                        1. **Select Top N**: Use the slider to choose how many top models to include (1-5)
                        2. **Generate Preview**: Click "Generate Card Preview" to see the card
                        3. **Download**: Click "Download as PNG" to save as high-quality image
                        4. **Share**: Use the downloaded image in presentations, reports, or social media

                        **Card Features:**
                        - 🏆 **Medal Indicators**: Gold, silver, bronze for top 3 performers
                        - 📊 **Key Metrics**: Success rate, cost, duration, and tokens per model
                        - 📈 **Aggregate Stats**: Overall leaderboard statistics at a glance
                        - 🎨 **TraceMind Branding**: Professional design with logo
                        - 📥 **High Quality**: PNG format suitable for presentations

                        **Best Practices:**
                        - Use 3-5 models for balanced card density
                        - Include metric context in your presentations
                        - Update cards regularly to reflect latest results
                        - Combine with detailed reports for stakeholders

                        **Tips:**
                        - Cards are automatically sized for readability
                        - All current sidebar filters are applied
                        - Cards update dynamically as data changes
                        """)

                    with gr.Row():
                        with gr.Column(scale=1):
                            top_n_slider = gr.Slider(
                                minimum=1,
                                maximum=5,
                                value=3,
                                step=1,
                                label="Number of top models to show",
                                info="Select how many top performers to include in the card"
                            )

                            with gr.Row():
                                generate_card_btn = gr.Button("🎨 Generate Card Preview", variant="secondary", size="lg")
                                download_card_btn = gr.Button("📥 Download as PNG", variant="primary", size="lg", visible=False)

                        with gr.Column(scale=2):
                            card_preview = gr.HTML(label="Card Preview", value="<p style='text-align: center; color: #666; padding: 40px;'>Click 'Generate Card Preview' to see your summary card</p>")
    
                with gr.TabItem("🤖 AI Insights"):
                    # User Guide Accordion
                    with gr.Accordion("📖 About AI Insights", open=False):
                        gr.Markdown("""
                        ### 🤖 LLM-Powered Leaderboard Analysis

                        **What is this tab?**
                        AI Insights provides intelligent, natural language analysis of your leaderboard data using advanced language models.
                        Get instant insights, trends, and recommendations powered by AI.

                        **How it works:**
                        - 📊 **Automatic Analysis**: AI analyzes all leaderboard data automatically
                        - 🔄 **Streaming Responses**: Watch insights generate in real-time (Gradio 6)
                        - 🎯 **Smart Recommendations**: Get actionable advice for model selection
                        - 📈 **Trend Detection**: AI identifies patterns and anomalies
                        - 💡 **Context-Aware**: Insights adapt to current filters and data

                        **What insights you'll get:**
                        - **Top Performers**: Which models lead in accuracy, speed, cost
                        - **Trade-offs**: Cost vs accuracy, speed vs quality analysis
                        - **Recommendations**: Best model for different use cases
                        - **Trends**: Performance changes over time
                        - **Anomalies**: Unusual results that need attention
                        - **Optimization Tips**: How to improve evaluation strategies

                        **Powered by:**
                        - 🤖 **MCP Servers**: Model Context Protocol for intelligent data access
                        - 🧠 **Advanced LLMs**: Google Gemini 1.5 Pro for analysis
                        - 📡 **Real-time Streaming**: Gradio 6 for live response generation
                        - 🔗 **Context Integration**: Understands your full leaderboard context

                        **Tips:**
                        - Click "Regenerate" for updated insights after data changes
                        - Insights respect your sidebar and inline filters
                        - Use insights to guide model selection decisions
                        - Share AI insights in team discussions
                        """)

                    with gr.Row():
                        regenerate_btn = gr.Button("🔄 Regenerate Insights (Streaming)", size="sm", variant="secondary")
                        gr.Markdown("*Real-time AI analysis powered by Gradio 6 streaming*", elem_classes=["text-sm"])
                    mcp_insights = gr.Markdown("*Loading insights...*")
    
            # Hidden textbox for row selection (JavaScript bridge)
            selected_row_index = gr.Textbox(visible=False, elem_id="selected_row_index")
    
        # Screen 3: Run Detail (Enhanced with Tabs)
        with gr.Column(visible=False) as run_detail_screen:
            # Navigation
            with gr.Row():
                back_to_leaderboard_btn = gr.Button("⬅️ Back to Leaderboard", variant="secondary", size="sm")
                download_run_card_btn = gr.Button("📥 Download Run Report Card", variant="secondary", size="sm")

            run_detail_title = gr.Markdown("# 📊 Run Detail")

            with gr.Tabs():
                with gr.TabItem("📋 Overview"):
                    gr.Markdown("*Run metadata and summary*")
                    run_metadata_html = gr.HTML("")

                with gr.TabItem("✅ Test Cases"):
                    gr.Markdown("*Individual test case results*")
                    test_cases_table = gr.Dataframe(
                        headers=["Task ID", "Status", "Tool", "Duration", "Tokens", "Cost", "Trace ID"],
                        interactive=False,
                        wrap=True
                    )
                    gr.Markdown("*Click a test case to view detailed trace (including Thought Graph)*")

                with gr.TabItem("⚡ Performance"):
                    gr.Markdown("*Performance metrics and charts*")
                    performance_charts = gr.Plot(label="Performance Analysis", show_label=False)

                with gr.TabItem("📄 Report Card"):
                    gr.Markdown("*Downloadable run summary card*")
                    run_card_html = gr.HTML(label="Run Report Card", value="<p style='text-align: center; color: #666; padding: 40px;'>Select a run to view its report card</p>")

        # Screen 4: Trace Detail with Sub-tabs
        # Created hidden (visible=False); the test_cases_table.select handler wired
        # further down lists this column in its outputs, and the navigate_to_*
        # handlers hide it again via gr.update(visible=False).
        with gr.Column(visible=False) as trace_detail_screen:
            with gr.Row():
                # Wired below to go_back_to_run_detail.
                back_to_run_detail_btn = gr.Button("⬅️ Back to Run Detail", variant="secondary", size="sm")

            # Title and metadata placeholders; both are outputs of the
            # test-case selection handler.
            trace_title = gr.Markdown("# 🔍 Trace Detail")
            trace_metadata_html = gr.HTML("")

            # One tab per view of the selected trace.
            with gr.Tabs():
                with gr.TabItem("🧠 Thought Graph"):
                    gr.Markdown("""
                    ### Agent Reasoning Flow

                    This interactive network graph shows **how your agent thinks** - the logical flow of reasoning steps,
                    tool calls, and LLM interactions.

                    **How to read it:**
                    - 🟣 **Purple nodes** = LLM reasoning steps
                    - 🟠 **Orange nodes** = Tool calls
                    - 🔵 **Blue nodes** = Chains/Agents
                    - **Arrows** = Flow from one step to the next
                    - **Hover** = See tokens, costs, and timing details
                    """)
                    trace_thought_graph = gr.Plot(label="Thought Graph", show_label=False)

                with gr.TabItem("📊 Waterfall"):
                    gr.Markdown("*Interactive waterfall diagram showing span execution timeline*")
                    gr.Markdown("*Hover over spans for details. Drag to zoom, double-click to reset.*")
                    span_visualization = gr.Plot(label="Trace Waterfall", show_label=False)

                with gr.TabItem("🖥️ GPU Metrics"):
                    gr.Markdown("*Performance metrics for GPU-based models (not available for API models)*")
                    gpu_summary_cards_html = gr.HTML(label="GPU Summary", show_label=False)

                    # Nested tabs: plotted dashboard vs. the raw metrics payload.
                    with gr.Tabs():
                        with gr.TabItem("📈 Time Series Dashboard"):
                            gpu_metrics_plot = gr.Plot(label="GPU Metrics Over Time", show_label=False)

                        with gr.TabItem("📋 Raw Metrics Data"):
                            gpu_metrics_json = gr.JSON(label="GPU Metrics Data")

                with gr.TabItem("📝 Span Details"):
                    gr.Markdown("*Detailed span information with token and cost data*")
                    # Read-only table; headers match the columns returned by
                    # create_span_details_table for an empty span list.
                    span_details_table = gr.Dataframe(
                        headers=["Span Name", "Kind", "Duration (ms)", "Tokens", "Cost (USD)", "Status"],
                        interactive=False,
                        wrap=True,
                        label="Span Breakdown"
                    )

                with gr.TabItem("🔍 Raw Data"):
                    gr.Markdown("*Raw OpenTelemetry trace data (JSON)*")
                    span_details_json = gr.JSON()

            # Collapsed-by-default Q&A panel for the current trace.
            # NOTE(review): no event handler for trace_ask_btn appears in the
            # wiring section below - confirm it is connected elsewhere.
            with gr.Accordion("🤖 Ask About This Trace", open=False):
                trace_question = gr.Textbox(
                    label="Question",
                    placeholder="e.g., Why was the tool called twice?",
                    lines=2
                )
                trace_ask_btn = gr.Button("Ask", variant="primary")
                trace_answer = gr.Markdown("*Ask a question to get AI-powered insights*")

        # Screen 5: Compare Screen
        # create_compare_ui returns the screen container plus a dict of its
        # components; the dict is indexed by string keys in the handlers below
        # (e.g. 'compare_run_a_dropdown', 'compare_button').
        compare_screen, compare_components = create_compare_ui()

        # Navigation handlers (define before use)
        def navigate_to_dashboard():
            """
            Switch the UI to the dashboard screen and refresh its widgets.

            Returns:
                Dict mapping components to ``gr.update`` objects. The dashboard
                screen is made visible, every other screen is hidden, and the
                dashboard nav button is highlighted as primary. Whatever
                ``update_dashboard_data`` returns is merged on top. If loading
                fails the error is printed and only the navigation updates
                are returned.
            """
            # Fetch fresh data first; a failure must not block navigation.
            try:
                dashboard_updates = update_dashboard_data(
                    data_loader.load_leaderboard(),
                    dashboard_components,
                )
            except Exception as e:
                print(f"[ERROR] Loading dashboard data: {e}")
                dashboard_updates = {}

            all_screens = (
                dashboard_screen,
                leaderboard_screen,
                run_detail_screen,
                trace_detail_screen,
                compare_screen,
            )
            nav_buttons = (
                dashboard_nav_btn,
                leaderboard_nav_btn,
                compare_nav_btn,
                docs_nav_btn,
            )

            # Only the dashboard screen is shown; only its button is highlighted.
            result = {
                screen: gr.update(visible=screen is dashboard_screen)
                for screen in all_screens
            }
            for button in nav_buttons:
                highlight = "primary" if button is dashboard_nav_btn else "secondary"
                result[button] = gr.update(variant=highlight)

            # Data updates are applied last so they are merged over the navigation state.
            result.update(dashboard_updates)
            return result

        def navigate_to_leaderboard():
            """
            Switch the UI to the leaderboard screen.

            Returns:
                Dict mapping each screen container and nav button to a
                ``gr.update``: the leaderboard screen is visible and its nav
                button is primary; all other screens are hidden and all other
                nav buttons are secondary.
            """
            all_screens = (
                dashboard_screen,
                leaderboard_screen,
                run_detail_screen,
                trace_detail_screen,
                compare_screen,
            )
            nav_buttons = (
                dashboard_nav_btn,
                leaderboard_nav_btn,
                compare_nav_btn,
                docs_nav_btn,
            )

            updates = {
                screen: gr.update(visible=screen is leaderboard_screen)
                for screen in all_screens
            }
            for button in nav_buttons:
                highlight = "primary" if button is leaderboard_nav_btn else "secondary"
                updates[button] = gr.update(variant=highlight)
            return updates

        def navigate_to_compare():
            """
            Switch the UI to the compare screen and populate the run dropdowns.

            Each dropdown choice is a ``(label, value)`` pair where the label is
            ``"<model> - <timestamp>"`` and the value is the composite key
            ``"<run_id>|<timestamp>"``. Rows without a usable ``run_id`` are
            skipped, since their key could not identify a run.

            Returns:
                Dict mapping components to ``gr.update`` objects. The compare
                screen is made visible and its nav button highlighted in every
                case. The two run dropdowns are included only when the
                leaderboard loads successfully; on failure the error is printed
                and the dropdowns are left untouched.
            """
            # Navigation state is identical on the success and error paths, so
            # build it once instead of duplicating the mapping.
            result = {
                dashboard_screen: gr.update(visible=False),
                leaderboard_screen: gr.update(visible=False),
                run_detail_screen: gr.update(visible=False),
                trace_detail_screen: gr.update(visible=False),
                compare_screen: gr.update(visible=True),
                dashboard_nav_btn: gr.update(variant="secondary"),
                leaderboard_nav_btn: gr.update(variant="secondary"),
                compare_nav_btn: gr.update(variant="primary"),
                docs_nav_btn: gr.update(variant="secondary"),
            }

            try:
                leaderboard_df = data_loader.load_leaderboard()

                # Create run choices for dropdowns (model name with composite unique identifier)
                run_choices = []
                for _, row in leaderboard_df.iterrows():
                    run_id = row.get('run_id', '')
                    # The composite key always contains "|", so testing the key
                    # itself for truthiness can never filter anything. Test the
                    # run_id instead. pd.isna goes first because missing
                    # DataFrame cells are NaN/NA, which are not plainly falsy.
                    if pd.isna(run_id) or not run_id:
                        continue

                    label = f"{row.get('model', 'Unknown')} - {row.get('timestamp', 'N/A')}"
                    # Use composite key: run_id|timestamp to ensure uniqueness
                    value = f"{run_id}|{row.get('timestamp', '')}"
                    run_choices.append((label, value))

                dropdown_updates = {
                    compare_components['compare_run_a_dropdown']: gr.update(choices=run_choices),
                    compare_components['compare_run_b_dropdown']: gr.update(choices=run_choices),
                }
            except Exception as e:
                # Best effort: still navigate even if the data could not be loaded.
                print(f"[ERROR] Navigating to compare: {e}")
                return result

            result.update(dropdown_updates)
            return result

        # Event handlers
        # NOTE: registration order is preserved exactly; only layout changed.

        # Load dashboard on app start
        app.load(
            fn=navigate_to_dashboard,
            outputs=[
                dashboard_screen,
                leaderboard_screen,
                run_detail_screen,
                trace_detail_screen,
                compare_screen,
                dashboard_nav_btn,
                leaderboard_nav_btn,
                compare_nav_btn,
                docs_nav_btn,
                *dashboard_components.values(),
            ],
        )

        # Populate the leaderboard view and its filters on page load.
        # NOTE(review): model_filter appears twice in the outputs - presumably
        # load_leaderboard returns two values for it; confirm against the handler.
        app.load(
            fn=load_leaderboard,
            outputs=[leaderboard_by_model, model_filter, model_filter, provider_filter],
        )

        # Draw the trends plot on page load.
        app.load(
            fn=load_trends,
            outputs=[trends_plot],
        )

        # Load drilldown data on page load
        app.load(
            fn=load_drilldown,
            inputs=[drilldown_agent_type_filter, drilldown_provider_filter],
            outputs=[leaderboard_table],
        )

        # Refresh button handler
        # NOTE(review): model_filter is listed twice here as well - confirm.
        refresh_leaderboard_btn.click(
            fn=refresh_leaderboard,
            outputs=[leaderboard_by_model, model_filter, model_filter],
        )

        # Leaderboard tab inline filters
        apply_filters_btn.click(
            fn=apply_leaderboard_filters,
            inputs=[agent_type_filter, provider_filter, sort_by_dropdown, sort_order],
            outputs=[leaderboard_by_model],
        )

        # DrillDown tab inline filters
        apply_drilldown_filters_btn.click(
            fn=apply_drilldown_filters,
            inputs=[
                drilldown_agent_type_filter,
                drilldown_provider_filter,
                drilldown_sort_by_dropdown,
                drilldown_sort_order,
            ],
            outputs=[leaderboard_table],
        )

        # Sidebar filters (apply to all tabs)
        # Both sidebar controls call the same handler with the same inputs/outputs.
        model_filter.change(
            fn=apply_sidebar_filters,
            inputs=[model_filter, sidebar_agent_type_filter],
            outputs=[
                leaderboard_by_model,
                leaderboard_table,
                trends_plot,
                compare_components['compare_run_a_dropdown'],
                compare_components['compare_run_b_dropdown'],
            ],
        )

        sidebar_agent_type_filter.change(
            fn=apply_sidebar_filters,
            inputs=[model_filter, sidebar_agent_type_filter],
            outputs=[
                leaderboard_by_model,
                leaderboard_table,
                trends_plot,
                compare_components['compare_run_a_dropdown'],
                compare_components['compare_run_b_dropdown'],
            ],
        )

        # Analytics chart: redraw when the visualization type changes ...
        viz_type.change(
            fn=update_analytics,
            inputs=[viz_type],
            outputs=[analytics_chart, viz_explanation],
        )

        # ... and draw it once on page load.
        app.load(
            fn=update_analytics,
            inputs=[viz_type],
            outputs=[analytics_chart, viz_explanation],
        )

        # Build the leaderboard summary card for the chosen top-N.
        generate_card_btn.click(
            fn=generate_card,
            inputs=[top_n_slider],
            outputs=[card_preview, download_card_btn],
        )

        # Download leaderboard summary card as PNG
        # (client-side only: fn=None, the work is done by the injected JS)
        download_card_btn.click(
            fn=None,
            js=download_card_as_png_js("summary-card-html"),
        )

        # MCP insights: generated on page load and again on demand.
        app.load(
            fn=generate_insights,
            outputs=[mcp_insights],
        )

        regenerate_btn.click(
            fn=generate_insights,
            outputs=[mcp_insights],
        )

        # Wire up navigation buttons
        # Every navigate_to_* handler updates the same five screens and four nav
        # buttons, so the shared part of the outputs is spelled out once.
        screen_nav_outputs = [
            dashboard_screen,
            leaderboard_screen,
            run_detail_screen,
            trace_detail_screen,
            compare_screen,
            dashboard_nav_btn,
            leaderboard_nav_btn,
            compare_nav_btn,
            docs_nav_btn,
        ]

        # Dashboard navigation also refreshes the dashboard widgets.
        dashboard_nav_btn.click(
            fn=navigate_to_dashboard,
            outputs=[*screen_nav_outputs, *dashboard_components.values()],
        )

        leaderboard_nav_btn.click(
            fn=navigate_to_leaderboard,
            outputs=[*screen_nav_outputs],
        )

        # Compare navigation also repopulates the two run dropdowns.
        compare_nav_btn.click(
            fn=navigate_to_compare,
            outputs=[
                *screen_nav_outputs,
                compare_components['compare_run_a_dropdown'],
                compare_components['compare_run_b_dropdown'],
            ],
        )

        # Compare button handler
        # The lambda forwards the two dropdown values together with
        # leaderboard_df_cache and the component dict to on_compare_runs.
        compare_components['compare_button'].click(
            fn=lambda run_a, run_b: on_compare_runs(run_a, run_b, leaderboard_df_cache, compare_components),
            inputs=[
                compare_components['compare_run_a_dropdown'],
                compare_components['compare_run_b_dropdown'],
            ],
            outputs=[
                compare_components[key]
                for key in (
                    'comparison_output',
                    'run_a_card',
                    'run_b_card',
                    'comparison_charts',
                    'winner_summary',
                    'radar_comparison_chart',
                )
            ],
        )

        # Back to leaderboard from compare
        compare_components['back_to_leaderboard_btn'].click(
            fn=navigate_to_leaderboard,
            outputs=[*screen_nav_outputs],
        )

        # Run detail navigation: selecting a row in the drilldown table opens
        # the run detail screen and fills its tabs.
        leaderboard_table.select(
            fn=on_drilldown_select,
            inputs=[leaderboard_table],  # Pass dataframe to handler (like MockTraceMind)
            outputs=[
                leaderboard_screen,
                run_detail_screen,
                run_metadata_html,
                test_cases_table,
                performance_charts,
                run_card_html,
            ],
        )

        back_to_leaderboard_btn.click(
            fn=go_back_to_leaderboard,
            inputs=[],
            outputs=[leaderboard_screen, run_detail_screen],
        )

        # Trace detail navigation
        # Selecting a test case swaps run detail for trace detail and fills
        # every trace tab (thought graph, waterfall, spans, GPU metrics).
        test_cases_table.select(
            fn=on_test_case_select,
            inputs=[test_cases_table],
            outputs=[
                run_detail_screen,
                trace_detail_screen,
                trace_title,
                trace_metadata_html,
                trace_thought_graph,
                span_visualization,
                span_details_table,
                span_details_json,
                gpu_summary_cards_html,
                gpu_metrics_plot,
                gpu_metrics_json,
            ],
        )

        back_to_run_detail_btn.click(
            fn=go_back_to_run_detail,
            outputs=[run_detail_screen, trace_detail_screen],
        )

        # HTML table row click handler (JavaScript bridge via hidden textbox)
        # selected_row_index is both the trigger and an output of this handler.
        # NOTE(review): unlike the Dataframe select handler above, this one does
        # not output performance_charts - confirm that is intended.
        selected_row_index.change(
            fn=on_html_table_row_click,
            inputs=[selected_row_index],
            outputs=[
                leaderboard_screen,
                run_detail_screen,
                run_metadata_html,
                test_cases_table,
                run_card_html,
                selected_row_index,
            ],
        )

        # Download run report card as PNG
        # (client-side only: fn=None, the work is done by the injected JS)
        download_run_card_btn.click(
            fn=None,
            js=download_card_as_png_js(element_id="run-card-html"),
        )


if __name__ == "__main__":
    # Startup banner: echo the data configuration read from the environment
    # (with the same fallbacks shown when the variables are unset).
    print(
        "Starting TraceMind-AI...",
        f"Data Source: {os.getenv('DATA_SOURCE', 'both')}",
        f"JSON Path: {os.getenv('JSON_DATA_PATH', './sample_data')}",
        sep="\n",
    )

    # Bind on all interfaces at port 7860, without a public share link.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)