Fixed model ranking to estimate missing ranks
app.py
CHANGED
@@ -334,6 +334,133 @@ def filter_target_benchmarks(df):
 
     return df[available_benchmarks].copy()
 
+def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
+    """
+    Estimate missing benchmark ranks using rank correlation-based imputation.
+
+    Args:
+        df: DataFrame with models as rows and benchmarks as columns
+        method: Rank correlation method ('spearman' or 'kendall')
+        min_corr: Minimum correlation threshold to use for prediction
+        min_benchmarks: Minimum number of benchmarks needed for prediction
+
+    Returns:
+        DataFrame with estimated ranks filled in
+    """
+    # Convert scores to ranks (higher score = better rank = lower rank number)
+    df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+    df_ranks_imputed = df_ranks.copy()
+
+    # Compute rank correlation matrix
+    if method == 'spearman':
+        rank_corr_matrix = df_ranks.corr(method='spearman')
+    elif method == 'kendall':
+        rank_corr_matrix = df_ranks.corr(method='kendall')
+    else:
+        rank_corr_matrix = df_ranks.corr(method='pearson')  # fallback
+
+    # For each model and benchmark combination with missing data
+    for model_idx in df.index:
+        for benchmark in df.columns:
+            if pd.isna(df_ranks.loc[model_idx, benchmark]):
+                # Find benchmarks this model has ranks for
+                available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
+
+                if len(available_benchmarks) >= min_benchmarks:
+                    # Get rank correlations between target benchmark and available benchmarks
+                    correlations = []
+                    ranks = []
+
+                    for avail_bench in available_benchmarks:
+                        corr_val = rank_corr_matrix.loc[benchmark, avail_bench]
+                        if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
+                            correlations.append(abs(corr_val))  # Use absolute correlation as weight
+                            ranks.append(df_ranks.loc[model_idx, avail_bench])
+
+                    if len(correlations) > 0:
+                        # Weighted average of ranks using correlations as weights
+                        correlations = np.array(correlations)
+                        ranks = np.array(ranks)
+
+                        # Normalize weights
+                        weights = correlations / correlations.sum()
+                        estimated_rank = np.average(ranks, weights=weights)
+
+                        df_ranks_imputed.loc[model_idx, benchmark] = estimated_rank
+
+    return df_ranks_imputed
+
+def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
+    """
+    Create a consensus ranking using rank correlation-based estimation.
+
+    Returns:
+        tuple: (ranking_df, rank_matrix, metadata)
+    """
+    if use_rank_imputation:
+        # Estimate missing ranks
+        df_ranks = estimate_missing_ranks(df, method)
+
+        # Calculate consensus rank for each model (median rank across all benchmarks)
+        consensus_ranks = df_ranks.median(axis=1, skipna=True)
+
+        # Calculate coverage and estimation statistics
+        original_coverage = df.notna().sum(axis=1)
+        imputed_coverage = df_ranks.notna().sum(axis=1)
+        estimated_count = imputed_coverage - original_coverage
+
+        # Create ranking dataframe
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(consensus_ranks[model]),
+                'Original_Benchmarks': int(original_coverage[model]),
+                'Total_Benchmarks': int(imputed_coverage[model]),
+                'Estimated_Ranks': int(estimated_count[model]),
+                'Coverage_Pct': float(original_coverage[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)  # Lower rank = better
+
+        metadata = {
+            'method': method,
+            'imputation_used': True,
+            'total_estimates': int(estimated_count.sum()),
+            'models_with_estimates': int((estimated_count > 0).sum()),
+            'ranking_method': 'consensus_rank'
+        }
+
+    else:
+        # Simple ranking based on available data only
+        df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+        median_ranks = df_ranks.median(axis=1, skipna=True)
+
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(median_ranks[model]),
+                'Original_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Total_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Estimated_Ranks': 0,
+                'Coverage_Pct': float(df.notna().sum(axis=1)[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)
+
+        metadata = {
+            'method': 'none',
+            'imputation_used': False,
+            'total_estimates': 0,
+            'models_with_estimates': 0,
+            'ranking_method': 'median_rank'
+        }
+
+    return ranking_df, df_ranks, metadata
+
 def main():
     """Main application."""
     st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
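As a quick sanity check on the weighted-average step in estimate_missing_ranks above: if a model is missing one benchmark but holds ranks 5 and 9 on two benchmarks whose rank correlations with the missing one are 0.8 and 0.4 (both above the default 0.3 threshold), the estimated rank is (0.8*5 + 0.4*9) / (0.8 + 0.4), roughly 6.3. A minimal sketch of that arithmetic with made-up numbers (none of these values come from the app's data):

import numpy as np

# Illustrative numbers only, mirroring the weighted-average step in estimate_missing_ranks().
correlations = np.array([0.8, 0.4])  # |rank correlation| with the missing benchmark
ranks = np.array([5.0, 9.0])         # this model's ranks on the correlated benchmarks

weights = correlations / correlations.sum()
print(np.average(ranks, weights=weights))  # ~6.33, pulled toward the more strongly correlated benchmark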
@@ -707,25 +834,119 @@ def show_model_performance(df):
     # Performance ranking
     st.subheader("Model Rankings")
 
+    # Ranking method controls
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        use_rank_imputation = st.checkbox(
+            "Use rank-based estimation",
+            value=True,
+            help="Estimate missing rankings using rank correlations between benchmarks. More fair than simple averaging."
+        )
 
+    with col2:
+        if use_rank_imputation:
+            rank_method = st.selectbox(
+                "Rank correlation method",
+                ["spearman", "kendall"],
+                help="Spearman: More sensitive to monotonic relationships\nKendall: More robust to outliers"
+            )
+        else:
+            rank_method = "none"
+
+    with col3:
+        if use_rank_imputation:
+            min_corr = st.slider(
+                "Min correlation threshold",
+                min_value=0.1,
+                max_value=0.8,
+                value=0.3,
+                step=0.1,
+                help="Minimum rank correlation required to use a benchmark for prediction"
+            )
+        else:
+            min_corr = 0.3
+
+    # Generate rankings
+    ranking_df, rank_matrix, metadata = create_consensus_ranking(
+        df_display,
+        method=rank_method,
+        use_rank_imputation=use_rank_imputation
+    )
+
+    # Display ranking information
     col1, col2 = st.columns(2)
 
     with col1:
+        st.markdown("**🏆 Top 15 Models**")
+
+        if metadata['imputation_used']:
+            st.caption(f"🔬 Using {metadata['method']} rank correlations with {metadata['total_estimates']} estimated ranks")
+        else:
+            st.caption("📊 Using median rank of available rankings")
+
+        rank_num = 0
+        for i, row in ranking_df.head(15).iterrows():
+            rank_num += 1
+            estimated_info = f" (+{row['Estimated_Ranks']} est.)" if row['Estimated_Ranks'] > 0 else ""
+            coverage_info = f"{row['Coverage_Pct']:.0f}%"
+
+            if metadata['imputation_used']:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']}/{row['Total_Benchmarks']} benchmarks{estimated_info}")
+            else:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']} benchmarks ({coverage_info} coverage)")
 
     with col2:
+        st.markdown("**📊 Ranking Distribution**")
+
+        # Create histogram of consensus ranks
+        fig = px.histogram(
+            ranking_df,
+            x='Consensus_Rank',
+            nbins=20,
+            title="Distribution of Consensus Rankings",
+            labels={'Consensus_Rank': 'Average Rank (lower is better)', 'count': 'Number of Models'}
+        )
+        fig.update_layout(height=400)
         st.plotly_chart(fig, use_container_width=True)
 
+    # Show ranking methodology explanation
+    if metadata['imputation_used']:
+        with st.expander("ℹ️ How Rank-Based Estimation Works"):
+            st.write(f"""
+            **Method**: {metadata['method'].title()} rank correlation
+
+            **Process**:
+            1. Convert benchmark scores to ranks (1st, 2nd, 3rd, etc.)
+            2. Calculate rank correlations between all benchmark pairs
+            3. For missing data: predict rank using weighted average of available ranks
+            4. Weights based on rank correlation strength (min threshold: {min_corr})
+            5. Final consensus rank = median rank across all benchmarks
+
+            **Upsides**:
+            - Eliminates bias from models tested only on easier/harder benchmarks
+            - Uses the correlation structure to make informed predictions
+            - Focuses on relative ranking rather than absolute scores
+            - More robust to outliers and scale differences
+            - Median consensus rank is less affected by extreme outlier rankings
+
+            **Statistics**:
+            - Total rank estimates made: {metadata['total_estimates']:,}
+            - Models with estimated ranks: {metadata['models_with_estimates']}
+            """)
+    else:
+        with st.expander("ℹ️ Simple Ranking Method"):
+            st.write("""
+            **Method**: Median rank of available rankings
+
+            **Limitation**: Models tested on fewer or easier benchmarks may appear artificially better.
+
+            **Recommendation**: Enable rank-based estimation for fairer comparisons.
+            """)
+
+    # Model comparison section
     st.subheader("Model Comparison")
 
     # Benchmark selection for radar chart (always visible)
@@ -771,11 +992,11 @@ def show_model_performance(df):
     available_models_for_selection = df_display.index.tolist()
     models_info = f"({len(available_models_for_selection)} models total)"
 
+    # Model selection with filtered list - use top ranked models as default
     if available_models_for_selection:
+        # Get top performers from ranking
+        top_models_from_ranking = ranking_df['Full_Model_Name'].head(5).tolist()
+        default_selection = [m for m in top_models_from_ranking if m in available_models_for_selection][:3]
     else:
         default_selection = []
 
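For reference, a minimal standalone sketch of how the new ranking helpers could be exercised outside the Streamlit UI. It assumes app.py can be imported without side effects, and the model and benchmark names below are placeholders rather than real leaderboard entries:

import pandas as pd

from app import create_consensus_ranking  # assumes app.py is importable as a module

# Toy score matrix (models x benchmarks); higher is better, one cell is missing.
scores = pd.DataFrame(
    {
        "bench_a": [0.82, 0.75, 0.60, 0.55],
        "bench_b": [0.78, 0.70, 0.58, 0.50],
        "bench_c": [0.66, 0.61, 0.47, 0.40],
        "bench_d": [0.90, None, 0.65, 0.52],  # org/model_2 was never evaluated here
    },
    index=["org/model_1", "org/model_2", "org/model_3", "org/model_4"],
)

ranking_df, rank_matrix, metadata = create_consensus_ranking(
    scores, method="spearman", use_rank_imputation=True
)

# Lower Consensus_Rank is better; Estimated_Ranks counts imputed cells per model.
print(ranking_df[["Model", "Consensus_Rank", "Estimated_Ranks", "Coverage_Pct"]])
print(metadata["total_estimates"], metadata["models_with_estimates"])

With four benchmarks, org/model_2 still has the three observed ranks required by the default min_benchmarks, so its missing bench_d rank is imputed instead of being silently ignored in the consensus.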