import json
import pandas as pd
from statistics import mean

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError

from constants import (
    REPO_ID,
    HF_TOKEN,
    DATASETS,
    SHORT_DATASET_NAMES,
    DATASET_DESCRIPTIONS,
)

api = HfApi(token=HF_TOKEN)


def init_repo():
    """Create the results dataset repo if it does not exist yet."""
    try:
        api.repo_info(REPO_ID, repo_type="dataset")
    except RepositoryNotFoundError:
        create_repo(REPO_ID, repo_type="dataset", private=True, token=HF_TOKEN)


def load_data():
    """Load the leaderboard results and render them as a table."""
    columns = (
        ["model_name", "link", "license", "overall_wer", "overall_cer"]
        + [f"wer_{ds}" for ds in DATASETS]
        + [f"cer_{ds}" for ds in DATASETS]
    )
    try:
        dataset = load_dataset(REPO_ID, token=HF_TOKEN)
        df = dataset["train"].to_pandas()
    except EmptyDatasetError:
        # No results submitted yet: start from an empty frame with the expected columns.
        df = pd.DataFrame(columns=columns)

    if not df.empty:
        # Rank models by average WER (lower is better).
        df = df.sort_values("overall_wer").reset_index(drop=True)
        df.insert(0, "rank", df.index + 1)

        # Convert fractional error rates to percentages.
        df["overall_wer"] = (df["overall_wer"] * 100).round(2).apply(lambda x: f"{x}")
        df["overall_cer"] = (df["overall_cer"] * 100).round(2).apply(lambda x: f"{x}")
        for ds in DATASETS:
            df[f"wer_{ds}"] = (df[f"wer_{ds}"] * 100).round(2)
            df[f"cer_{ds}"] = (df[f"cer_{ds}"] * 100).round(2)

        # Collapse the per-dataset WER/CER columns into one short-named column each.
        for short_ds, ds in zip(SHORT_DATASET_NAMES, DATASETS):
            df[short_ds] = df.apply(
                lambda row: f'{row[f"wer_{ds}"]:.2f}',
                axis=1,
            )
            df = df.drop(columns=[f"wer_{ds}", f"cer_{ds}"])

        df["model_name"] = df.apply(
            lambda row: f'{row["model_name"]}',
            axis=1,
        )
        df = df.drop(columns=["link"])

        # Label permissively licensed models as "Открытая", everything else as "Закрытая".
        df["license"] = df["license"].apply(
            lambda x: "Открытая"
            if any(term in x.lower() for term in ["mit", "apache", "bsd", "gpl", "open"])
            else "Закрытая"
        )

        df.rename(
            columns={
                "overall_wer": "Средний WER ⬇️",
                "overall_cer": "Средний CER ⬇️",
                "license": "Тип модели",
                "model_name": "Модель",
                "rank": "Ранг",
            },
            inplace=True,
        )

    table_html = df.to_html(escape=False, index=False)

    # NOTE: how the pieces below are combined into the returned string is an
    # assumption; the header row and separator literal come from the original code.
    header_row = (
        "| Ранг | Модель | Тип модели | Средний WER ⬇️ | Средний CER ⬇️ | "
        + "".join(f"{short} | " for short in SHORT_DATASET_NAMES)
    )
    separator_row = "|---|"
    return f"{header_row}\n{separator_row}\n{table_html}"
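

# Minimal usage sketch (assumption: not part of the original module). Running the
# file directly initialises the results repo and prints the rendered leaderboard,
# which serves as a smoke test once HF_TOKEN and REPO_ID are set in constants.py.
if __name__ == "__main__":
    init_repo()
    print(load_data())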