Spaces:

vidore
/

vidore-leaderboard

Running

App Files Community

Add ViDoRe V3 (MTEB embed)

#14

by antoineedy - opened 26 days ago

base: refs/heads/main

←

from: refs/pr/14

Discussion Files changed

+144

-112

Files changed (5) hide show

app.py +132 -104
app/utils.py +0 -1
data/dataset_handler.py +3 -3
data/deprecated_model_handler.py +5 -1
data/model_handler.py +4 -3

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
-from app.utils import add_rank_and_format, filter_models, get_refresh_function, deprecated_get_refresh_function
 from data.deprecated_model_handler import DeprecatedModelHandler
 from data.model_handler import ModelHandler
@@ -80,98 +81,31 @@ def main():
     with gr.Blocks(css=css) as block:
         with gr.Tabs():
-            with gr.TabItem("ViDoRe V1"):
-                gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
-                gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
                 gr.Markdown(
                     """
-                Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
-                Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
                 """
                 )
-                datasets_columns_1 = list(data_benchmark_1.columns[4:])
-                with gr.Row():
-                    metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
-                    research_textbox_1 = gr.Textbox(
-                        placeholder="🔍 Search Models... [press enter]",
-                        label="Filter Models by Name",
-                    )
-                    column_checkboxes_1 = gr.CheckboxGroup(
-                        choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
-                    )
-                with gr.Row():
-                    datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
-                    dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
-                def update_data_1(metric, search_term, selected_columns):
-                    model_handler.get_vidore_data(metric)
-                    data = model_handler.render_df(metric, benchmark_version=1)
-                    data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
-                    data = filter_models(data, search_term)
-                    if selected_columns:
-                        data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
-                    return data
-                with gr.Row():
-                    refresh_button_1 = gr.Button("Refresh")
-                    refresh_button_1.click(
-                        get_refresh_function(model_handler, benchmark_version=1),
-                        inputs=[metric_dropdown_1],
-                        outputs=dataframe_1,
-                        concurrency_limit=20,
-                    )
-                # Automatically refresh the dataframe when the dropdown value changes
-                metric_dropdown_1.change(
-                    get_refresh_function(model_handler, benchmark_version=1),
-                    inputs=[metric_dropdown_1],
-                    outputs=dataframe_1,
-                )
-                research_textbox_1.submit(
-                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
-                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
-                    outputs=dataframe_1,
-                )
-                column_checkboxes_1.change(
-                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
-                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
-                    outputs=dataframe_1,
-                )
-                gr.Markdown(
-                    f"""
-                - **Total Datasets**: {num_datasets_1}
-                - **Total Scores**: {num_scores_1}
-                - **Total Models**: {num_models_1}
-                """
-                    + r"""
-                Please consider citing:
-                ```bibtex
-                @misc{faysse2024colpaliefficientdocumentretrieval,
-                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
-                  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
-                  year={2024},
-                  eprint={2407.01449},
-                  archivePrefix={arXiv},
-                  primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2407.01449},
-                }
-                @misc{macé2025vidorebenchmarkv2raising,
-                  title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
-                  author={Quentin Macé and António Loison and Manuel Faysse},
-                  year={2025},
-                  eprint={2505.17166},
-                  archivePrefix={arXiv},
-                  primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2505.17166},
-                }
-                ```
                 """
                 )
             with gr.TabItem("ViDoRe V2"):
@@ -180,7 +114,7 @@ def main():
                 gr.Markdown(
                     """
-                Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
                 """
@@ -223,7 +157,7 @@ def main():
                 with gr.Row():
                     gr.Markdown(
                         """
-                    **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
                     Those numbers are not numbers obtained from the organisations that released those models.
                     """
                     )
@@ -256,24 +190,118 @@ def main():
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
-                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
-                      title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                       author={Quentin Macé and António Loison and Manuel Faysse},
                       year={2025},
                       eprint={2505.17166},
                       archivePrefix={arXiv},
                       primaryClass={cs.IR},
-                      url={https://arxiv.org/abs/2505.17166},
                 }
                 ```
                 """
                 )
@@ -310,7 +338,7 @@ def main():
                 gr.Markdown(
                     """
-                Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
                 """
@@ -378,23 +406,23 @@ def main():
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
-                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
-                  title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                   author={Quentin Macé and António Loison and Manuel Faysse},
                   year={2025},
                   eprint={2505.17166},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2505.17166},
                 }
                 ```
                 """
@@ -413,7 +441,7 @@ def main():
                 gr.Markdown(
                     """
-                Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
                 """
@@ -456,7 +484,7 @@ def main():
                 with gr.Row():
                     gr.Markdown(
                         """
-                    **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
                     Those numbers are not numbers obtained from the organisations that released those models.
                     """
                     )
@@ -489,23 +517,23 @@ def main():
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
-                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
-                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
-                      title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                       author={Quentin Macé and António Loison and Manuel Faysse},
                       year={2025},
                       eprint={2505.17166},
                       archivePrefix={arXiv},
                       primaryClass={cs.IR},
-                      url={https://arxiv.org/abs/2505.17166},
                 }
                 ```
                 """

 import gradio as gr
+from app.utils import (add_rank_and_format, deprecated_get_refresh_function,
+                       filter_models, get_refresh_function)
 from data.deprecated_model_handler import DeprecatedModelHandler
 from data.model_handler import ModelHandler
     with gr.Blocks(css=css) as block:
         with gr.Tabs():
+            with gr.TabItem("ViDoRe V3"):
+                # Embed the MTEB leaderboard as HTML: https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)
+                gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
                 gr.Markdown(
                     """
+                Visual Document Retrieval Benchmark 3 leaderboard. To submit results, refer to the corresponding tab.
+                Refer to:
+                - 🤗 The [blogpost](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3) for all the details on the datasets,
+                - 🤗 The [dataset collection](https://huggingface.co/collections/vidore/vidore-benchmark-v3),
+                - 📝 The [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics.
                 """
                 )
+                gr.Markdown("""
+                As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
+                we embed it here.
+                            """ )
+                gr.HTML(
+                    """
+                <iframe
+                    src="https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)"
+                    style="width:100%; height:1000px; border:2px solid black; border-radius:10px;"
+                ></iframe>
                 """
                 )
             with gr.TabItem("ViDoRe V2"):
                 gr.Markdown(
                     """
+                Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
                 """
                 with gr.Row():
                     gr.Markdown(
                         """
+                    **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
                     Those numbers are not numbers obtained from the organisations that released those models.
                     """
                     )
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
+                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
+                      title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                       author={Quentin Macé and António Loison and Manuel Faysse},
                       year={2025},
                       eprint={2505.17166},
                       archivePrefix={arXiv},
                       primaryClass={cs.IR},
+                      url={https://arxiv.org/abs/2505.17166},
                 }
+                ```
+                """
+                )
+            with gr.TabItem("ViDoRe V1"):
+                gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
+                gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
+                gr.Markdown(
+                    """
+                Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
+                Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
+                """
+                )
+                datasets_columns_1 = list(data_benchmark_1.columns[4:])
+                with gr.Row():
+                    metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
+                    research_textbox_1 = gr.Textbox(
+                        placeholder="🔍 Search Models... [press enter]",
+                        label="Filter Models by Name",
+                    )
+                    column_checkboxes_1 = gr.CheckboxGroup(
+                        choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
+                    )
+                with gr.Row():
+                    datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
+                    dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
+                def update_data_1(metric, search_term, selected_columns):
+                    model_handler.get_vidore_data(metric)
+                    data = model_handler.render_df(metric, benchmark_version=1)
+                    data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
+                    data = filter_models(data, search_term)
+                    if selected_columns:
+                        data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
+                    return data
+                with gr.Row():
+                    refresh_button_1 = gr.Button("Refresh")
+                    refresh_button_1.click(
+                        get_refresh_function(model_handler, benchmark_version=1),
+                        inputs=[metric_dropdown_1],
+                        outputs=dataframe_1,
+                        concurrency_limit=20,
+                    )
+                # Automatically refresh the dataframe when the dropdown value changes
+                metric_dropdown_1.change(
+                    get_refresh_function(model_handler, benchmark_version=1),
+                    inputs=[metric_dropdown_1],
+                    outputs=dataframe_1,
+                )
+                research_textbox_1.submit(
+                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
+                    outputs=dataframe_1,
+                )
+                column_checkboxes_1.change(
+                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
+                    outputs=dataframe_1,
+                )
+                gr.Markdown(
+                    f"""
+                - **Total Datasets**: {num_datasets_1}
+                - **Total Scores**: {num_scores_1}
+                - **Total Models**: {num_models_1}
+                """
+                    + r"""
+                Please consider citing:
+                ```bibtex
+                @misc{faysse2024colpaliefficientdocumentretrieval,
+                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
+                  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
+                  year={2024},
+                  eprint={2407.01449},
+                  archivePrefix={arXiv},
+                  primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2407.01449},
+                }
+                @misc{macé2025vidorebenchmarkv2raising,
+                  title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
+                  author={Quentin Macé and António Loison and Manuel Faysse},
+                  year={2025},
+                  eprint={2505.17166},
+                  archivePrefix={arXiv},
+                  primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2505.17166},
+                }
                 ```
                 """
                 )
                 gr.Markdown(
                     """
+                Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
                 """
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
+                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
+                  title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                   author={Quentin Macé and António Loison and Manuel Faysse},
                   year={2025},
                   eprint={2505.17166},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2505.17166},
                 }
                 ```
                 """
                 gr.Markdown(
                     """
+                Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
                 Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
                 """
                 with gr.Row():
                     gr.Markdown(
                         """
+                    **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
                     Those numbers are not numbers obtained from the organisations that released those models.
                     """
                     )
                 ```bibtex
                 @misc{faysse2024colpaliefficientdocumentretrieval,
+                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
                   author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
                   year={2024},
                   eprint={2407.01449},
                   archivePrefix={arXiv},
                   primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2407.01449},
                 }
                 @misc{macé2025vidorebenchmarkv2raising,
+                      title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
                       author={Quentin Macé and António Loison and Manuel Faysse},
                       year={2025},
                       eprint={2505.17166},
                       archivePrefix={arXiv},
                       primaryClass={cs.IR},
+                      url={https://arxiv.org/abs/2505.17166},
                 }
                 ```
                 """

app/utils.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from data.deprecated_model_handler import DeprecatedModelHandler
 def make_clickable_model(model_name, link=None):



1
2
3	def make_clickable_model(model_name, link=None):

data/dataset_handler.py CHANGED Viewed

@@ -108,7 +108,7 @@ def deprecated_get_datasets_nickname(dataset_name) -> str:
         return "ESG Restaurant Human"
     elif ("rse_restaurant" in dataset_name and "multilingual" in dataset_name) or (
-        "esg_reports" in dataset_name and not "_eng_" in dataset_name
     ):
         return "ESG Restaurant Synthetic Multilingual"
@@ -116,7 +116,7 @@ def deprecated_get_datasets_nickname(dataset_name) -> str:
         return "ESG Restaurant Synthetic"
     elif ("mit_biomedical" in dataset_name and "multilingual" in dataset_name) or (
-        "biomedical_lectures" in dataset_name and not "_eng_" in dataset_name
     ):
         return "MIT Biomedical Multilingual"
@@ -124,7 +124,7 @@ def deprecated_get_datasets_nickname(dataset_name) -> str:
         return "MIT Biomedical"
     elif ("economics_macro" in dataset_name and "multilingual" in dataset_name) or (
-        "economics_reports" in dataset_name and not "_eng_" in dataset_name
     ):
         return "Economics Macro Multilingual"

         return "ESG Restaurant Human"
     elif ("rse_restaurant" in dataset_name and "multilingual" in dataset_name) or (
+        "esg_reports" in dataset_name and "_eng_" not in dataset_name
     ):
         return "ESG Restaurant Synthetic Multilingual"
         return "ESG Restaurant Synthetic"
     elif ("mit_biomedical" in dataset_name and "multilingual" in dataset_name) or (
+        "biomedical_lectures" in dataset_name and "_eng_" not in dataset_name
     ):
         return "MIT Biomedical Multilingual"
         return "MIT Biomedical"
     elif ("economics_macro" in dataset_name and "multilingual" in dataset_name) or (
+        "economics_reports" in dataset_name and "_eng_" not in dataset_name
     ):
         return "Economics Macro Multilingual"

data/deprecated_model_handler.py CHANGED Viewed

@@ -5,7 +5,11 @@ from typing import Any, Dict
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, metadata_load
-from .dataset_handler import DEPRECATED_VIDORE_2_DATASETS_KEYWORDS, DEPRECATED_VIDORE_DATASETS_KEYWORDS, deprecated_get_datasets_nickname
 BLOCKLIST = ["impactframes"]

 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, metadata_load
+from .dataset_handler import (
+    DEPRECATED_VIDORE_2_DATASETS_KEYWORDS,
+    DEPRECATED_VIDORE_DATASETS_KEYWORDS,
+    deprecated_get_datasets_nickname,
+)
 BLOCKLIST = ["impactframes"]

data/model_handler.py CHANGED Viewed

@@ -1,11 +1,12 @@
-from git import Repo
-import shutil
-import os
 import json
 import pandas as pd
 from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
 class ModelHandler:
     def __init__(self):

 import json
+import os
 import pandas as pd
+from git import Repo
 from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
 class ModelHandler:
     def __init__(self):