remove legacy cost table tab

Files changed:
- app.py +3 -148
- src/populate.py +2 -5
app.py CHANGED

@@ -1,36 +1,15 @@
 import gradio as gr
 import pandas as pd
 
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COLS,
-    COST_COLS,
-    COST_TYPES,
-    TS_COLS,
-    TS_TYPES,
-    TYPES,
-    AutoEvalColumn,
-    CostEvalColumn,
-    TSEvalColumn,
-    fields,
-)
-
-# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.display.utils import COLS, TS_COLS, TS_TYPES, TYPES, AutoEvalColumn, TSEvalColumn, fields
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS, TS_COLS)
+original_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)
 
-# raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
-leaderboard_cost_df = cost_df.copy()
 leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
@@ -60,20 +39,6 @@ def update_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
-def update_cost_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_flavor_query: list,
-):
-    filtered_df = filter_llm_func(hidden_df, llm_query)
-    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
-    df = select_columns_cost_table(filtered_df, columns)
-    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
-
-
 def update_ts_table(
     hidden_df: pd.DataFrame,
     columns: list,
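Aside: the removed update_cost_table and the surviving update_*_table callbacks share one pipeline, filter the hidden dataframe on the user's selections, then highlight low-cost cells via pandas Styler. The snippet below is a minimal, self-contained sketch of that pattern, not the repository's code: the column names, the "Low" band value, and the toy dataframe are invented, and Styler.map needs pandas 2.1 or newer.

```python
import pandas as pd


# Sketch of the filter -> highlight pipeline used by the update_*_table callbacks.
# Column names, the "Low" band value, and the sample data are placeholders.
def highlight_cost_band_low(value, props=""):
    # Styler.map calls this once per cell; return CSS props only for cells to highlight.
    return props if value == "Low" else ""


def update_table_sketch(hidden_df: pd.DataFrame, llm_query: list):
    # keep only rows whose model is still selected in the UI
    filtered_df = hidden_df[hidden_df["Model Name"].isin(llm_query)]
    # Styler.map (pandas >= 2.1) applies the highlighter element-wise
    return filtered_df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")


demo_df = pd.DataFrame({"Model Name": ["model-a", "model-b"], "Cost Band": ["Low", "High"]})
styled = update_table_sketch(demo_df, ["model-a", "model-b"])
```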
@@ -125,23 +90,6 @@ def init_leaderboard_df(
     )
 
 
-def init_leaderboard_cost_df(
-    leaderboard_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_type_query: list,
-):
-
-    return update_cost_table(
-        leaderboard_df,
-        columns,
-        llm_query,
-        llm_provider_query,
-        use_case_type_query,
-    )
-
-
 def init_leaderboard_ts_df(
     leaderboard_df: pd.DataFrame,
     columns: list,
@@ -183,10 +131,6 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
-def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
-    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
-
-
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -204,14 +148,6 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
-def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        CostEvalColumn.model.name,
-    ]
-    filtered_df = df[always_here_cols + [c for c in COST_COLS if c in df.columns and c in columns]]
-    return filtered_df
-
-
 def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         TSEvalColumn.model.name,
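Similarly, the select_columns_* helpers always keep the model column and then append whichever canonical columns the user ticked, in canonical order. A small sketch of that selection logic follows, using a made-up column list rather than the real COST_COLS or TS_COLS:

```python
import pandas as pd

# Illustrative stand-in for the leaderboard's canonical, non-model column list.
ALL_COLS = ["Latency (s)", "Cost Band"]


def select_columns_sketch(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    # the model column is always shown; other columns keep their canonical order
    always_here_cols = ["Model Name"]
    return df[always_here_cols + [c for c in ALL_COLS if c in df.columns and c in columns]]


df = pd.DataFrame({"Model Name": ["model-a"], "Latency (s)": [1.2], "Cost Band": ["Low"]})
print(select_columns_sketch(df, ["Cost Band"]))  # shows Model Name and Cost Band only
```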
@@ -423,87 +359,6 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-        with gr.TabItem("🏅 Latency & Cost", elem_id="llm-benchmark-tab-table", id=1):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(CostEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(CostEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-            with gr.Row():
-                with gr.Column():
-                    filter_llm = gr.CheckboxGroup(
-                        choices=list(cost_df["Model Name"].unique()),
-                        value=list(cost_df["Model Name"].unique()),
-                        label="Model Name",
-                        info="",
-                        interactive=True,
-                    )
-                with gr.Column():
-                    filter_llm_provider = gr.CheckboxGroup(
-                        choices=list(cost_df["LLM Provider"].unique()),
-                        value=list(cost_df["LLM Provider"].unique()),
-                        label="LLM Provider",
-                        info="",
-                        interactive=True,
-                    )
-                with gr.Column():
-                    filter_use_case_type = gr.CheckboxGroup(
-                        choices=["Long", "Short"],
-                        value=["Long", "Short"],
-                        label="Use Case Flavor",
-                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
-                        interactive=True,
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=init_leaderboard_cost_df(
-                    leaderboard_cost_df,
-                    shown_columns.value,
-                    filter_llm.value,
-                    filter_llm_provider.value,
-                    filter_use_case_type.value,
-                ),
-                headers=[c.name for c in fields(CostEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=COST_TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=cost_df[COST_COLS],
-                headers=COST_COLS,
-                datatype=COST_TYPES,
-                visible=False,
-            )
-
-            for selector in [
-                shown_columns,
-                filter_llm,
-                filter_llm_provider,
-                filter_use_case_type,
-            ]:
-                selector.change(
-                    update_cost_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_llm,
-                        filter_llm_provider,
-                        filter_use_case_type,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
         with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
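The deleted tab uses the same wiring as the tabs that remain: CheckboxGroup selectors drive a visible Dataframe, with a hidden, unfiltered Dataframe feeding the change callback so every re-render filters from scratch. Below is a stripped-down sketch of that Gradio pattern under invented data; the real app additionally routes through update_*_table, a column selector, and the leaderboard's own columns.

```python
import gradio as gr
import pandas as pd

# Illustrative data only; the real leaderboard loads its frames from CRM_RESULTS_PATH.
df = pd.DataFrame({"Model Name": ["model-a", "model-b"], "Score": [0.81, 0.77]})


def update_view(hidden_df: pd.DataFrame, models: list) -> pd.DataFrame:
    # keep only the rows whose model is still ticked in the CheckboxGroup
    return hidden_df[hidden_df["Model Name"].isin(models)]


with gr.Blocks() as demo:
    filter_llm = gr.CheckboxGroup(
        choices=list(df["Model Name"].unique()),
        value=list(df["Model Name"].unique()),
        label="Model Name",
        interactive=True,
    )
    # the visible table the user sees
    table = gr.Dataframe(value=df, interactive=False)
    # hidden, unfiltered copy that the change callback always filters from scratch
    hidden_table = gr.Dataframe(value=df, visible=False)
    filter_llm.change(update_view, [hidden_table, filter_llm], table, queue=True)

if __name__ == "__main__":
    demo.launch()
```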
src/populate.py CHANGED

@@ -6,7 +6,7 @@ from src.display.utils import AutoEvalColumn
 
 
 def get_leaderboard_df_crm(
-    crm_results_path: str, accuracy_cols: list, cost_cols: list, ts_cols: list
+    crm_results_path: str, accuracy_cols: list, ts_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
@@ -30,9 +30,6 @@ def get_leaderboard_df_crm(
         on=["Model Name", "Cost and Speed: Flavor"],
     )
 
-    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
     leaderboard_ts__crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
@@ -64,4 +61,4 @@ def get_leaderboard_df_crm(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     )
     leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
+    return leaderboard_accuracy_df, leaderboard_ts_df
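For reference, here is a rough, self-contained sketch (not the repository's actual implementation) of the load-and-clean steps still visible in get_leaderboard_df_crm after this change: read the trust-and-safety results CSV, drop excluded models, and round the displayed columns. The file name mirrors the diff; ts_cols and excluded_models are illustrative parameters.

```python
import os

import pandas as pd


def load_ts_leaderboard(crm_results_path: str, ts_cols: list, excluded_models: list) -> pd.DataFrame:
    # read the trust-and-safety results; the file name mirrors the diff above
    ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
    # keep only models that should appear on the public leaderboard
    ts_df = ts_df[~ts_df["Model Name"].isin(excluded_models)]
    # restrict to the displayed columns and round numeric scores for display
    return ts_df[ts_cols].round(decimals=2)
```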