Fixed model ranking to estimate missing ranks
app.py
CHANGED
@@ -334,6 +334,133 @@ def filter_target_benchmarks(df):
 
     return df[available_benchmarks].copy()
 
+def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
+    """
+    Estimate missing benchmark ranks using rank correlation-based imputation.
+
+    Args:
+        df: DataFrame with models as rows and benchmarks as columns
+        method: Rank correlation method ('spearman' or 'kendall')
+        min_corr: Minimum correlation threshold to use for prediction
+        min_benchmarks: Minimum number of benchmarks needed for prediction
+
+    Returns:
+        DataFrame with estimated ranks filled in
+    """
+    # Convert scores to ranks (higher score = better rank = lower rank number)
+    df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+    df_ranks_imputed = df_ranks.copy()
+
+    # Compute rank correlation matrix
+    if method == 'spearman':
+        rank_corr_matrix = df_ranks.corr(method='spearman')
+    elif method == 'kendall':
+        rank_corr_matrix = df_ranks.corr(method='kendall')
+    else:
+        rank_corr_matrix = df_ranks.corr(method='pearson')  # fallback
+
+    # For each model and benchmark combination with missing data
+    for model_idx in df.index:
+        for benchmark in df.columns:
+            if pd.isna(df_ranks.loc[model_idx, benchmark]):
+                # Find benchmarks this model has ranks for
+                available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
+
+                if len(available_benchmarks) >= min_benchmarks:
+                    # Get rank correlations between target benchmark and available benchmarks
+                    correlations = []
+                    ranks = []
+
+                    for avail_bench in available_benchmarks:
+                        corr_val = rank_corr_matrix.loc[benchmark, avail_bench]
+                        if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
+                            correlations.append(abs(corr_val))  # Use absolute correlation as weight
+                            ranks.append(df_ranks.loc[model_idx, avail_bench])
+
+                    if len(correlations) > 0:
+                        # Weighted average of ranks using correlations as weights
+                        correlations = np.array(correlations)
+                        ranks = np.array(ranks)
+
+                        # Normalize weights
+                        weights = correlations / correlations.sum()
+                        estimated_rank = np.average(ranks, weights=weights)
+
+                        df_ranks_imputed.loc[model_idx, benchmark] = estimated_rank
+
+    return df_ranks_imputed
+
+def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
+    """
+    Create a consensus ranking using rank correlation-based estimation.
+
+    Returns:
+        tuple: (ranking_df, rank_matrix, metadata)
+    """
+    if use_rank_imputation:
+        # Estimate missing ranks
+        df_ranks = estimate_missing_ranks(df, method)
+
+        # Calculate consensus rank for each model (median rank across all benchmarks)
+        consensus_ranks = df_ranks.median(axis=1, skipna=True)
+
+        # Calculate coverage and estimation statistics
+        original_coverage = df.notna().sum(axis=1)
+        imputed_coverage = df_ranks.notna().sum(axis=1)
+        estimated_count = imputed_coverage - original_coverage
+
+        # Create ranking dataframe
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(consensus_ranks[model]),
+                'Original_Benchmarks': int(original_coverage[model]),
+                'Total_Benchmarks': int(imputed_coverage[model]),
+                'Estimated_Ranks': int(estimated_count[model]),
+                'Coverage_Pct': float(original_coverage[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)  # Lower rank = better
+
+        metadata = {
+            'method': method,
+            'imputation_used': True,
+            'total_estimates': int(estimated_count.sum()),
+            'models_with_estimates': int((estimated_count > 0).sum()),
+            'ranking_method': 'consensus_rank'
+        }
+
+    else:
+        # Simple ranking based on available data only
+        df_ranks = df.rank(method='min', ascending=False, na_option='keep')
+        median_ranks = df_ranks.median(axis=1, skipna=True)
+
+        ranking_data = []
+        for model in df.index:
+            ranking_data.append({
+                'Model': model.split('/')[-1] if '/' in model else model,
+                'Full_Model_Name': model,
+                'Consensus_Rank': float(median_ranks[model]),
+                'Original_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Total_Benchmarks': int(df.notna().sum(axis=1)[model]),
+                'Estimated_Ranks': 0,
+                'Coverage_Pct': float(df.notna().sum(axis=1)[model] / len(df.columns) * 100)
+            })
+
+        ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True)
+
+        metadata = {
+            'method': 'none',
+            'imputation_used': False,
+            'total_estimates': 0,
+            'models_with_estimates': 0,
+            'ranking_method': 'median_rank'
+        }
+
+    return ranking_df, df_ranks, metadata
+
 def main():
     """Main application."""
     st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
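As a quick sanity check on the weighted-average step in estimate_missing_ranks above: if a model is missing one benchmark but holds ranks 5 and 9 on two benchmarks whose rank correlations with the missing one are 0.8 and 0.4 (both above the default 0.3 threshold), the estimated rank is (0.8*5 + 0.4*9) / (0.8 + 0.4), roughly 6.3. A minimal sketch of that arithmetic with made-up numbers (none of these values come from the app's data):

import numpy as np

# Illustrative numbers only, mirroring the weighted-average step in estimate_missing_ranks().
correlations = np.array([0.8, 0.4])  # |rank correlation| with the missing benchmark
ranks = np.array([5.0, 9.0])         # this model's ranks on the correlated benchmarks

weights = correlations / correlations.sum()
print(np.average(ranks, weights=weights))  # ~6.33, pulled toward the more strongly correlated benchmark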
@@ -707,25 +834,119 @@ def show_model_performance(df):
     # Performance ranking
     st.subheader("Model Rankings")
 
+    # Ranking method controls
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        use_rank_imputation = st.checkbox(
+            "Use rank-based estimation",
+            value=True,
+            help="Estimate missing rankings using rank correlations between benchmarks. More fair than simple averaging."
+        )
 
+    with col2:
+        if use_rank_imputation:
+            rank_method = st.selectbox(
+                "Rank correlation method",
+                ["spearman", "kendall"],
+                help="Spearman: More sensitive to monotonic relationships\nKendall: More robust to outliers"
+            )
+        else:
+            rank_method = "none"
+
+    with col3:
+        if use_rank_imputation:
+            min_corr = st.slider(
+                "Min correlation threshold",
+                min_value=0.1,
+                max_value=0.8,
+                value=0.3,
+                step=0.1,
+                help="Minimum rank correlation required to use a benchmark for prediction"
+            )
+        else:
+            min_corr = 0.3
+
+    # Generate rankings
+    ranking_df, rank_matrix, metadata = create_consensus_ranking(
+        df_display,
+        method=rank_method,
+        use_rank_imputation=use_rank_imputation
+    )
+
+    # Display ranking information
     col1, col2 = st.columns(2)
 
     with col1:
+        st.markdown("**🏆 Top 15 Models**")
+
+        if metadata['imputation_used']:
+            st.caption(f"🔬 Using {metadata['method']} rank correlations with {metadata['total_estimates']} estimated ranks")
+        else:
+            st.caption("📊 Using median rank of available rankings")
+
+        rank_num = 0
+        for i, row in ranking_df.head(15).iterrows():
+            rank_num += 1
+            estimated_info = f" (+{row['Estimated_Ranks']} est.)" if row['Estimated_Ranks'] > 0 else ""
+            coverage_info = f"{row['Coverage_Pct']:.0f}%"
+
+            if metadata['imputation_used']:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']}/{row['Total_Benchmarks']} benchmarks{estimated_info}")
+            else:
+                st.write(f"{rank_num}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
+                st.caption(f" 📊 {row['Original_Benchmarks']} benchmarks ({coverage_info} coverage)")
 
     with col2:
+        st.markdown("**📊 Ranking Distribution**")
+
+        # Create histogram of consensus ranks
+        fig = px.histogram(
+            ranking_df,
+            x='Consensus_Rank',
+            nbins=20,
+            title="Distribution of Consensus Rankings",
+            labels={'Consensus_Rank': 'Average Rank (lower is better)', 'count': 'Number of Models'}
+        )
+        fig.update_layout(height=400)
         st.plotly_chart(fig, use_container_width=True)
 
+    # Show ranking methodology explanation
+    if metadata['imputation_used']:
+        with st.expander("ℹ️ How Rank-Based Estimation Works"):
+            st.write(f"""
+            **Method**: {metadata['method'].title()} rank correlation
+
+            **Process**:
+            1. Convert benchmark scores to ranks (1st, 2nd, 3rd, etc.)
+            2. Calculate rank correlations between all benchmark pairs
+            3. For missing data: predict rank using weighted average of available ranks
+            4. Weights based on rank correlation strength (min threshold: {min_corr})
+            5. Final consensus rank = median rank across all benchmarks
+
+            **Upsides**:
+            - Eliminates bias from models tested only on easier/harder benchmarks
+            - Uses the correlation structure to make informed predictions
+            - Focuses on relative ranking rather than absolute scores
+            - More robust to outliers and scale differences
+            - Median consensus rank is less affected by extreme outlier rankings
+
+            **Statistics**:
+            - Total rank estimates made: {metadata['total_estimates']:,}
+            - Models with estimated ranks: {metadata['models_with_estimates']}
+            """)
+    else:
+        with st.expander("ℹ️ Simple Ranking Method"):
+            st.write("""
+            **Method**: Median rank of available rankings
+
+            **Limitation**: Models tested on fewer or easier benchmarks may appear artificially better.
+
+            **Recommendation**: Enable rank-based estimation for fairer comparisons.
+            """)
+
+    # Model comparison section
     st.subheader("Model Comparison")
 
     # Benchmark selection for radar chart (always visible)
@@ -771,11 +992,11 @@ def show_model_performance(df):
     available_models_for_selection = df_display.index.tolist()
     models_info = f"({len(available_models_for_selection)} models total)"
 
+    # Model selection with filtered list - use top ranked models as default
     if available_models_for_selection:
+        # Get top performers from ranking
+        top_models_from_ranking = ranking_df['Full_Model_Name'].head(5).tolist()
+        default_selection = [m for m in top_models_from_ranking if m in available_models_for_selection][:3]
     else:
         default_selection = []
 
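For reference, a minimal standalone sketch of how the new ranking helpers could be exercised outside the Streamlit UI. It assumes app.py can be imported without side effects, and the model and benchmark names below are placeholders rather than real leaderboard entries:

import pandas as pd

from app import create_consensus_ranking  # assumes app.py is importable as a module

# Toy score matrix (models x benchmarks); higher is better, one cell is missing.
scores = pd.DataFrame(
    {
        "bench_a": [0.82, 0.75, 0.60, 0.55],
        "bench_b": [0.78, 0.70, 0.58, 0.50],
        "bench_c": [0.66, 0.61, 0.47, 0.40],
        "bench_d": [0.90, None, 0.65, 0.52],  # org/model_2 was never evaluated here
    },
    index=["org/model_1", "org/model_2", "org/model_3", "org/model_4"],
)

ranking_df, rank_matrix, metadata = create_consensus_ranking(
    scores, method="spearman", use_rank_imputation=True
)

# Lower Consensus_Rank is better; Estimated_Ranks counts imputed cells per model.
print(ranking_df[["Model", "Consensus_Rank", "Estimated_Ranks", "Coverage_Pct"]])
print(metadata["total_estimates"], metadata["models_with_estimates"])

With four benchmarks, org/model_2 still has the three observed ranks required by the default min_benchmarks, so its missing bench_d rank is imputed instead of being silently ignored in the consensus.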