add filter: Use Case Area
Files changed:
- app.py +98 -30
- crm-results/hf_leaderboard_latency_cost.csv +37 -0
- src/display/utils.py +2 -5
- src/populate.py +2 -0
app.py
CHANGED

@@ -34,6 +34,7 @@ def update_table(
     llm_query: list,
     llm_provider_query: list,
     accuracy_method_query: str,
+    use_case_area_query: list,
     use_case_query: list,
     use_case_type_query: list,
     # type_query: list,

@@ -49,20 +50,49 @@ def update_table(
     filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)

     filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
-
+    filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
     return df


+def init_leaderboard_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+    accuracy_method_query: str,
+    use_case_area_query: list,
+    use_case_query: list,
+    use_case_type_query: list,
+):
+    return update_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+        accuracy_method_query,
+        use_case_area_query,
+        use_case_query,
+        use_case_type_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]


+def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
+    return df[
+        df["Use Case Area"].apply(
+            lambda x: len(set([_.strip() for _ in x.split("&")]).intersection(use_case_area_query))
+        )
+        > 0
+    ]
+
+
 def filter_use_case_func(df: pd.DataFrame, use_case_query: list) -> pd.DataFrame:
-    # print(use_case_query)
-    # print(df[df["Use Case Name"].isin(["Service: Conversation summary"])])
     return df[df["Use Case Name"].isin(use_case_query)]

@@ -170,7 +200,33 @@ with demo:
     # )
     with gr.Row():
         with gr.Column():
-            filter_use_case_area = gr.CheckboxGroup(
+            filter_llm = gr.CheckboxGroup(
+                choices=list(original_df["Model Name"].unique()),
+                value=list(original_df["Model Name"].unique()),
+                label="Model Name",
+                info="",
+                interactive=True,
+            )
+        with gr.Column():
+            filter_llm_provider = gr.CheckboxGroup(
+                choices=list(original_df["LLM Provider"].unique()),
+                value=list(original_df["LLM Provider"].unique()),
+                label="LLM Provider",
+                info="",
+                interactive=True,
+            )
+    with gr.Row():
+        filter_use_case = gr.CheckboxGroup(
+            choices=list(original_df["Use Case Name"].unique()),
+            value=list(original_df["Use Case Name"].unique()),
+            label="Use Case",
+            info="",
+            # multiselect=True,
+            interactive=True,
+        )
+    with gr.Row():
+        with gr.Column():
+            filter_use_case_area = gr.CheckboxGroup(
                 choices=["Service", "Sales"],
                 value=["Service", "Sales"],
                 label="Use Case Area",

@@ -185,15 +241,15 @@ with demo:
                 info="",
                 interactive=True,
             )
-        with gr.Column():
-            filter_use_case = gr.Dropdown(
-                choices=list(original_df["Use Case Name"].unique()),
-                value=list(original_df["Use Case Name"].unique()),
-                label="Use Case",
-                info="",
-                multiselect=True,
-                interactive=True,
-            )
+        # with gr.Column():
+        #     filter_use_case = gr.Dropdown(
+        #         choices=list(original_df["Use Case Name"].unique()),
+        #         value=list(original_df["Use Case Name"].unique()),
+        #         label="Use Case",
+        #         info="",
+        #         multiselect=True,
+        #         interactive=True,
+        #     )
         with gr.Column():
             filter_metric_area = gr.CheckboxGroup(
                 choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],

@@ -217,25 +273,35 @@ with demo:
                 info="Range: 0.0 to 4.0",
                 interactive=True,
             )
-        with gr.Column():
-            filter_llm = gr.CheckboxGroup(
-                choices=list(original_df["Model Name"].unique()),
-                value=list(leaderboard_df["Model Name"].unique()),
-                label="Model Name",
-                info="",
-                interactive=True,
-            )
-        with gr.Column():
-            filter_llm_provider = gr.CheckboxGroup(
-                choices=list(original_df["LLM Provider"].unique()),
-                value=list(leaderboard_df["LLM Provider"].unique()),
-                label="LLM Provider",
-                info="",
-                interactive=True,
-            )
+        # with gr.Column():
+        #     filter_llm = gr.CheckboxGroup(
+        #         choices=list(original_df["Model Name"].unique()),
+        #         value=list(leaderboard_df["Model Name"].unique()),
+        #         label="Model Name",
+        #         info="",
+        #         interactive=True,
+        #     )
+        # with gr.Column():
+        #     filter_llm_provider = gr.CheckboxGroup(
+        #         choices=list(original_df["LLM Provider"].unique()),
+        #         value=list(leaderboard_df["LLM Provider"].unique()),
+        #         label="LLM Provider",
+        #         info="",
+        #         interactive=True,
+        #     )

     leaderboard_table = gr.components.Dataframe(
-        value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+        # value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+        value=init_leaderboard_df(
+            leaderboard_df,
+            shown_columns.value,
+            filter_llm.value,
+            filter_llm_provider.value,
+            filter_accuracy_method.value,
+            filter_use_case_area.value,
+            filter_use_case.value,
+            filter_use_case_type.value,
+        ),
         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
         datatype=TYPES,
         elem_id="leaderboard-table",

@@ -268,6 +334,7 @@ with demo:
         filter_llm,
         filter_llm_provider,
         filter_accuracy_method,
+        filter_use_case_area,
         filter_use_case,
         filter_use_case_type,
         # filter_columns_type,

@@ -283,6 +350,7 @@ with demo:
         filter_llm,
         filter_llm_provider,
         filter_accuracy_method,
+        filter_use_case_area,
         filter_use_case,
         filter_use_case_type,
         # filter_columns_type,
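In short, update_table now derives a "Use Case Area" column from the prefix of "Use Case Name" (the text before ": ") and filters it against the checked areas; filter_use_case_area_func keeps a row when any of its areas (split on "&") intersects the selection, and init_leaderboard_df reuses update_table so the table is pre-filtered with the widgets' initial values. A minimal sketch of the area filter on made-up rows (only "Service: Conversation summary" appears in the diff; the other two names are hypothetical):

import pandas as pd


def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
    # Keep rows whose set of areas (split on "&") overlaps the selected areas.
    return df[
        df["Use Case Area"].apply(
            lambda x: len(set([_.strip() for _ in x.split("&")]).intersection(use_case_area_query))
        )
        > 0
    ]


df = pd.DataFrame(
    {
        "Use Case Name": [
            "Service: Conversation summary",  # seen in the removed debug print
            "Sales: Email generation",        # hypothetical
            "Sales & Service: Call summary",  # hypothetical combined-area name
        ]
    }
)

# Same derivation as update_table: the area is the prefix before ": ".
df["Use Case Area"] = df["Use Case Name"].apply(lambda x: x.split(": ")[0])

print(filter_use_case_area_func(df, ["Service"])["Use Case Name"].tolist())
# ['Service: Conversation summary', 'Sales & Service: Call summary']

Using set intersection rather than equality is what lets a combined area such as "Sales & Service" match either checkbox.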
crm-results/hf_leaderboard_latency_cost.csv
ADDED

@@ -0,0 +1,37 @@
+Model Name,Use Case Type (Long vs Short),Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band
+AI21 Jamba-Instruct,Long,AI21,4.0,232.9,1.6,High
+AI21 Jamba-Instruct,Short,AI21,4.0,243.9,0.5,High
+Claude 3 Haiku,Long,Bedrock,2.8,236.9,1.0,High
+Claude 3 Haiku,Short,Bedrock,2.2,245.4,0.4,High
+Claude 3 Opus,Long,Bedrock,12.2,242.7,61.1,High
+Claude 3 Opus,Short,Bedrock,8.4,243.2,25.4,High
+Cohere Command R+,Long,Bedrock,7.7,245.7,11.7,High
+Cohere Command R+,Short,Bedrock,7.1,249.9,5.1,High
+Cohere Command Text,Long,Bedrock,12.9,238.7,4.3,High
+Cohere Command Text,Short,Bedrock,9.6,245.6,1.1,High
+Gemini Pro 1.5,Long,Google,5.5,245.7,11.0,High
+Gemini Pro 1.5,Short,Google,5.4,247.5,3.3,High
+Gemini Pro 1,Long,Google,6.0,228.9,1.7,High
+Gemini Pro 1,Short,Google,4.4,247.4,0.6,High
+GPT 3.5 Turbo,Long,OpenAI,4.5,249.9,1.6,High
+GPT 3.5 Turbo,Short,OpenAI,4.2,238.3,0.6,High
+GPT 4 Turbo,Long,OpenAI,12.3,247.6,32.0,High
+GPT 4 Turbo,Short,OpenAI,12.3,250.0,11.7,High
+GPT4-o,Long,OpenAI,5.1,248.4,15.9,High
+GPT4-o,Short,OpenAI,5.0,250.0,5.8,High
+Mistral 7B,Long,Self-host (g5.48xlarge),8.83,242.0,16.5,High
+Mistral 7B,Short,Self-host (g5.48xlarge),8.31,247.0,15.5,High
+LLaMA 3 8B,Long,Self-host (g5.48xlarge),3.76,251.5,7.0,High
+LLaMA 3 8B,Short,Self-host (g5.48xlarge),3.23,243.6,6.0,High
+LLaMA 3 70B,Long,Self-host (p4d.24xlarge),20.1,243.9,67.7,High
+LLaMA 3 70B,Short,Self-host (p4d.24xlarge),29.4,251.2,99.0,High
+Mixtral 8x7B,Long,Self-host (p4d.24xlarge),2.44,248.5,8.22,High
+Mixtral 8x7B,Short,Self-host (p4d.24xlarge),2.41,250.0,8.11,High
+SF-TextBase 7B,Long,Self-host (g5.48xlarge),8.99,248.5,16.80,High
+SF-TextBase 7B,Short,Self-host (g5.48xlarge),8.29,248.7,15.50,High
+SF-TextBase 70B,Long,Self-host (p4de.24xlarge),6.52,253.7,28.17,High
+SF-TextBase 70B,Short,Self-host (p4de.24xlarge),6.24,249.7,26.96,High
+SF-TextSum,Long,Self-host (g5.48xlarge),8.85,244.0,16.55,High
+SF-TextSum,Short,Self-host (g5.48xlarge),8.34,250.4,15.60,High
+XGen 2,Long,Self-host (p4de.24xlarge),3.71,250.0,16.03,High
+XGen 2,Short,Self-host (p4de.24xlarge),2.64,250.0,11.40,High
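The new CSV records, per model and per request type (Long vs Short), the serving platform, mean latency, mean output tokens, and mean cost per 1K requests. Nothing in this commit reads the file yet, so the snippet below is only a sketch of one way it could be loaded from the same crm-results directory; the aggregation shown is an assumption, not code from the repo:

import os

import pandas as pd

crm_results_path = "crm-results"  # same directory populate.py reads hf_leaderboard_accuracy.csv from

latency_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))

# Example aggregation (assumed): mean latency per model across Long and Short requests.
mean_latency = (
    latency_cost_df.groupby("Model Name")["Mean Latency (sec) per Request"].mean().sort_values()
)
print(mean_latency)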
src/display/utils.py
CHANGED

@@ -25,14 +25,11 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
-auto_eval_column_dict.append(
-    ["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)]
-)
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
+auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])

 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
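Each auto_eval_column_dict entry pairs an attribute name with a ColumnContent describing the display label, cell type, whether the column is shown by default, and (for the model column) never_hidden; app.py builds its table headers from fields(AutoEvalColumn) using exactly those flags. The diff does not show ColumnContent's definition or how the dict becomes AutoEvalColumn, so the following is a self-contained sketch under that assumption, with field names inferred from the calls above:

from dataclasses import dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])

# Default table headers: never-hidden columns first, then anything displayed by default.
columns = [entry[2] for entry in auto_eval_column_dict]
default_headers = [c.name for c in columns if c.never_hidden] + [
    c.name for c in columns if c.displayed_by_default and not c.never_hidden
]
print(default_headers)  # ['Model Name', 'LLM Provider']

Assuming the third argument is the displayed-by-default flag, the True-to-False change on the use_case_name entry takes "Use Case Name" out of the default view, while the new "LLM Provider" column is shown.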
src/populate.py
CHANGED

@@ -11,6 +11,8 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
+    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
     # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
     #     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     # )
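The two added lines drop the Salesforce fine-tuned models from the accuracy results before the leaderboard dataframe is built. A tiny illustration of the ~isin exclusion mask; the model names are real entries from the CSVs above, but the frame itself is made up for the example:

import pandas as pd

leaderboard_accuracy_df = pd.DataFrame(
    {"Model Name": ["GPT 3.5 Turbo", "SF-TextSum", "Claude 3 Haiku", "SF-TextBase 7B"]}
)
sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]

# "~ ... .isin(...)" inverts the membership mask, keeping only models outside the list.
leaderboard_accuracy_df = leaderboard_accuracy_df[
    ~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)
]
print(leaderboard_accuracy_df["Model Name"].tolist())  # ['GPT 3.5 Turbo', 'Claude 3 Haiku']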