Spaces:
Running
Running
| import json | |
| import os | |
| import pandas as pd | |
| from git import Repo | |
| from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname | |
class ModelHandler:
    """Load ViDoRe benchmark results from the MTEB results repository and
    render them as pandas DataFrames.

    ``model_infos`` maps ``model_name`` -> ``{"meta": dict, "results": dict}``,
    where ``results`` maps an MTEB dataset name to its raw result JSON.
    """

    def __init__(self):
        # Populated by get_vidore_data(); empty until then.
        self.model_infos = {}

    @staticmethod
    def get_folders(dir_path):
        """Return the sorted names of the immediate sub-directories of *dir_path*.

        NOTE(fix): declared as a staticmethod — the original definition had no
        ``self`` parameter yet was called as ``self.get_folders(...)``, which
        raised ``TypeError`` at runtime.
        """
        return sorted(
            entry
            for entry in os.listdir(dir_path)
            if os.path.isdir(os.path.join(dir_path, entry))
        )

    def get_vidore_data(self, metric="ndcg_at_5"):
        """Clone or refresh the MTEB results repo and populate ``self.model_infos``.

        Parameters
        ----------
        metric : str
            Unused here; kept for backward compatibility (scores are extracted
            later, in :meth:`render_df`).
        """
        repo_url = "https://github.com/embeddings-benchmark/results.git"
        local_path = "./results"
        folder_of_interest = "results"

        # Refresh an existing checkout; otherwise make a shallow clone.
        if os.path.exists(local_path):
            repo = Repo(local_path)
            repo.remotes.origin.pull()
        else:
            Repo.clone_from(repo_url, local_path, depth=1)

        results_root = os.path.join(local_path, folder_of_interest)
        for model_name in self.get_folders(results_root):
            revisions = self.get_folders(os.path.join(results_root, model_name))
            if not revisions:
                # No revision folder at all — nothing to read for this model.
                continue
            # Only the first (alphabetically) revision is considered.
            revision_dir = os.path.join(results_root, model_name, revisions[0])
            result_filenames = os.listdir(revision_dir)

            if "model_meta.json" in result_filenames:
                with open(os.path.join(revision_dir, "model_meta.json"), "r") as f:
                    meta = json.load(f)
            else:
                meta = {}

            results = {}
            # A benchmark's results are kept only when *every* one of its
            # datasets has a result file; partial runs are ignored.
            for dataset_names in (VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES):
                if all(f"{name}.json" in result_filenames for name in dataset_names):
                    for name in dataset_names:
                        with open(os.path.join(revision_dir, f"{name}.json"), "r") as f:
                            results[name] = json.load(f)

            self.model_infos[model_name] = {"meta": meta, "results": results}

    def filter_models_by_benchmark(self, benchmark_version=1):
        """Return the subset of ``self.model_infos`` that has at least one
        result dataset belonging to the given ViDoRe benchmark version.

        Parameters
        ----------
        benchmark_version : int
            1 selects VIDORE_V1_MTEB_NAMES; any other value selects v2.
        """
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        return {
            model: info
            for model, info in self.model_infos.items()
            if any(any(keyword in dataset for keyword in keywords)
                   for dataset in info["results"])
        }

    def render_df(self, metric="ndcg_at_5", benchmark_version=1):
        """Build a DataFrame of scores: one row per model, one column per dataset.

        Parameters
        ----------
        metric : str
            Score key looked up in each result's ``scores.test[0]`` entry.
        benchmark_version : int
            Which ViDoRe benchmark's datasets to include (1 or 2).

        Returns
        -------
        pandas.DataFrame
            Empty when no model matches the requested benchmark.
        """
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if not filtered_model_infos:
            return pd.DataFrame()

        # Loop-invariant: same keyword list for every model.
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        model_res = {}
        for model, info in filtered_model_infos.items():
            dataset_res = {}
            # -1 marks a missing or unparsable parameter count (matches the
            # original's behavior for both the absent-key and TypeError cases).
            try:
                dataset_res["Model Size (Million Parameters)"] = (
                    info["meta"]["n_parameters"] // 1_000_000
                )
            except (KeyError, TypeError):
                dataset_res["Model Size (Million Parameters)"] = -1
            for dataset, payload in info["results"].items():
                if not any(keyword in dataset for keyword in keywords):
                    continue
                dataset_nickname = get_datasets_nickname(dataset)
                dataset_res[dataset_nickname] = payload["scores"]["test"][0][metric]
            model_res[model] = dataset_res
        return pd.DataFrame(model_res).T