Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 leader_component_values = [None]
 space = " "
 
-def make_default_md(arena_df, elo_results):
+def make_default_md():
     leaderboard_md = f"""
 # NeurIPS LLM Merging Competition Leaderboard
 [Website](https://llm-merging.github.io/index) | [Starter Kit (Github)](https://github.com/llm-merging/LLM-Merging) | [Discord](https://discord.com/invite/dPBHEVnV)
@@ -20,29 +20,20 @@ def make_default_md(arena_df, elo_results):
 """
     return leaderboard_md
 
-def make_arena_leaderboard_md(
+def make_arena_leaderboard_md(model_table_df):
     total_models = len(arena_df)
     leaderboard_md = f"""
-
+Validation Benchmark Performance is averaged.
+Final performance will be assessed at the end of the competition on a hidden test set, which may or may not be correlated with Validation performance.
 
-Higher values are better
+Higher values are better.
 
-Total #models: **{total_models}**.{space}
+Total #models: **{total_models}**.{space}
 
 """
     return leaderboard_md
 
 
-
-def make_leaderboard_md_live(elo_results):
-    leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
 def load_leaderboard_table_csv(filename, add_hyperlink=False):
     lines = open(filename).readlines()
     heads = [v.strip() for v in lines[0].split(",")]
@@ -52,47 +43,27 @@ def load_leaderboard_table_csv(filename, add_hyperlink=False):
         for j in range(len(heads)):
             item = {}
             for h, v in zip(heads, row):
-                if h == "
+                if h == "Validation Score":
                     if v != "-":
                         v = int(ast.literal_eval(v))
                     else:
                         v = np.nan
-                elif h == "MMLU":
-                    if v != "-":
-                        v = round(ast.literal_eval(v) * 100, 1)
-                    else:
-                        v = np.nan
-                elif h == "MT-bench (win rate %)":
-                    if v != "-":
-                        v = round(ast.literal_eval(v[:-1]), 1)
-                    else:
-                        v = np.nan
-                elif h == "MT-bench (score)":
-                    if v != "-":
-                        v = round(ast.literal_eval(v), 2)
-                    else:
-                        v = np.nan
                 item[h] = v
             if add_hyperlink:
-                item["Model"] = f'<a target="_blank"
+                item["Model"] = f'<a target="_blank" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{item["Model"]}</a>'
             rows.append(item)
     return rows
 
-def get_full_table(
+def get_full_table(model_table_df):
     values = []
     for i in range(len(model_table_df)):
         row = []
         ranking = i+1
         row.append(ranking)
-        model_key = model_table_df.iloc[i]["key"]
         model_name = model_table_df.iloc[i]["Model"]
         # model display name
         row.append(model_name)
         row.append(np.nan)
-        row.append(np.nan)
-        row.append(np.nan)
-        # Team
-        row.append(model_table_df.iloc[i]["Organization"])
 
         values.append(row)
     # values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
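For reference, after this hunk `load_leaderboard_table_csv` special-cases only the `Validation Score` column. A minimal sketch of the parsing rule it applies, with hypothetical cell values (the real column set lives in the `leaderboard_table_*.csv` files, which are not part of this commit):

```python
import ast
import numpy as np

# Hypothetical values for a "Validation Score" cell: numeric strings are
# parsed via ast.literal_eval and cast to int; "-" marks a missing score.
for v in ["78", "-"]:
    score = int(ast.literal_eval(v)) if v != "-" else np.nan
    print(score)  # -> 78, then nan
```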
@@ -105,24 +76,13 @@ cat_name_to_explanation = {
     "Overall": "Overall Questions",
 }
 
-def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False):
+def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
     arena_dfs = {}
     category_elo_results = {}
-    if
+    if leaderboard_table_file is None:  # Do live update
         default_md = "Loading ..."
     else:
-
-        elo_results = pickle.load(fin)
-        if "full" in elo_results:
-            print("KEYS ", elo_results.keys())
-        for k in elo_results.keys():
-            if k not in key_to_category_name:
-                continue
-            arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
-            category_elo_results[key_to_category_name[k]] = elo_results[k]
-
-        arena_df = arena_dfs["Overall"]
-        default_md = make_default_md(arena_df, category_elo_results["Overall"])
+        default_md = make_default_md()
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
@@ -130,35 +90,29 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
         model_table_df = pd.DataFrame(data)
 
         with gr.Tabs() as tabs:
-            arena_table_vals = get_full_table(
+            arena_table_vals = get_full_table(model_table_df)
             with gr.Tab("Full leaderboard", id=0):
-                md = make_arena_leaderboard_md(
+                md = make_arena_leaderboard_md(model_table_df)
                 leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
                 with gr.Row():
                     with gr.Column(scale=2):
-                        category_dropdown = gr.Dropdown(choices=
+                        category_dropdown = gr.Dropdown(choices=["Overall"], label="Category", value="Overall")
 
                 display_df = gr.Dataframe(
                     headers=[
                         "Rank",
-                        "🤖 Model",
-                        "⭐ Task 1",
-                        "📈 Task 2",
-                        "📚 Task 3",
-                        "Team",
+                        "🤖 Model / Submission Name",
+                        "⭐ Validation Performance",
                     ],
                     datatype=[
                         "number",
                         "markdown",
                         "number",
-                        "number",
-                        "number",
-                        "str",
                     ],
                     value=arena_table_vals,
                     elem_id="arena_leaderboard_dataframe",
                     height=700,
-                    column_widths=[70, 190, 110, 110, 110, 150],
+                    column_widths=[70, 190, 110],
                     wrap=True,
                 )
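The leaderboard table is now three columns wide. A self-contained sketch of the same `gr.Dataframe` configuration, with made-up rows for illustration (only the headers, datatypes, and widths mirror the diff above):

```python
import gradio as gr

# Dummy (Rank, Model, Validation Performance) rows; the "markdown" datatype
# is what lets the model cell render as an HTML link.
rows = [
    [1, '<a target="_blank">merge-alpha</a>', 81.2],
    [2, '<a target="_blank">merge-beta</a>', 79.5],
]

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=["Rank", "🤖 Model / Submission Name", "⭐ Validation Performance"],
        datatype=["number", "markdown", "number"],
        value=rows,
        column_widths=[70, 190, 110],
        wrap=True,
    )

demo.launch()
```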
@@ -179,53 +133,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
             else:
                 pass
 
-            def update_leaderboard_df(arena_table_vals):
-                elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "🤖 Model", "⭐ Task 1", "📈 Task 2", "📚 Task 3", "Team"])
-
-                # goal: color the rows based on the rank with styler
-                def highlight_max(s):
-                    # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
-                    return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
-
-                def highlight_rank_max(s):
-                    return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
-
-                return elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-            def update_leaderboard_and_plots(category):
-                arena_subset_df = arena_dfs[category]
-                arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
-                elo_subset_results = category_elo_results[category]
-                arena_df = arena_dfs["Overall"]
-                arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Overall" else None)
-                if category != "Overall":
-                    arena_values = update_leaderboard_df(arena_values)
-                arena_values = gr.Dataframe(
-                    headers=[
-                        "Rank",
-                        "🤖 Model",
-                        "⭐ Task 1",
-                        "📈 Task 2",
-                        "📚 Task 3",
-                        "Team",
-                    ],
-                    datatype=[
-                        "number",
-                        "markdown",
-                        "number",
-                        "number",
-                        "number",
-                        "str",
-                    ],
-                    value=arena_values,
-                    elem_id="arena_leaderboard_dataframe",
-                    height=700,
-                    column_widths=[70, 190, 110, 110, 110, 150],
-                    wrap=True,
-                )
-                return arena_values
-
-            category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[display_df])
 
         with gr.Accordion(
             "📝 Citation",
@@ -239,8 +146,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
     gr.Markdown(citation_md, elem_id="leaderboard_markdown")
     gr.Markdown(acknowledgment_md)
 
-    if show_plot:
-        return [md_1]
     return [md_1]
 
 
@@ -318,7 +223,7 @@ We thank []() for their generous [sponsorship]().
 </div>
 """
 
-def build_demo(elo_results_file, leaderboard_table_file):
+def build_demo(leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
     theme = gr.themes.Base(text_size=text_size)
     theme.set(button_secondary_background_fill_hover="*primary_300",
@@ -330,7 +235,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True
+            leaderboard_table_file, show_plot=True
         )
     return demo
 
@@ -342,13 +247,10 @@ if __name__ == "__main__":
     parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()
 
-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]
 
     leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
     leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
     leaderboard_table_file = leaderboard_table_files[-1]
 
-    demo = build_demo(elo_result_file, leaderboard_table_file)
+    demo = build_demo(leaderboard_table_file)
     demo.launch(share=args.share, server_name=args.host, server_port=args.port)
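The `__main__` block now keeps only the CSV discovery. The sort key relies on `len("leaderboard_table_") == 18`, so `x[18:-4]` is the numeric stamp between the prefix and `.csv`. A quick sketch with hypothetical file names:

```python
# x[18:-4] strips the "leaderboard_table_" prefix (18 chars) and the ".csv"
# suffix, leaving a date stamp that sorts numerically; the latest file wins.
files = ["leaderboard_table_20240415.csv", "leaderboard_table_20240601.csv"]
files.sort(key=lambda x: int(x[18:-4]))
print(files[-1])  # -> leaderboard_table_20240601.csv
```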