Spaces:

zixianma
/

TaskMeAnything-UI

Runtime error

App Files Files Community

zixianma commited on May 26, 2024

Commit

95d0904

1 Parent(s): baea0cd

added application file along with data

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +1231 -0
db/2d/2d-how-many/embeddings.pkl +3 -0
db/2d/2d-how-many/expanded_data.csv +0 -0
db/2d/2d-how-many/gt.pkl +3 -0
db/2d/2d-how-many/instructblip_vicuna13b_surprise.pkl +3 -0
db/2d/2d-how-many/instructblip_vicuna7b_surprise.pkl +3 -0
db/2d/2d-how-many/llava15_13b_surprise.pkl +3 -0
db/2d/2d-how-many/llava15_7b_surprise.pkl +3 -0
db/2d/2d-how-many/merged_data.csv +0 -0
db/2d/2d-how-many/path.json +0 -0
db/2d/2d-how-many/qa.pkl +3 -0
db/2d/2d-how-many/qwenvl_chat_surprise.pkl +3 -0
db/2d/2d-how-many/qwenvl_surprise.pkl +3 -0
db/2d/2d-how-many/task_plan.pkl +3 -0
db/2d/2d-what-attribute/embeddings.pkl +3 -0
db/2d/2d-what-attribute/expanded_data.csv +0 -0
db/2d/2d-what-attribute/gt.pkl +3 -0
db/2d/2d-what-attribute/instructblip_vicuna13b_surprise.pkl +3 -0
db/2d/2d-what-attribute/instructblip_vicuna7b_surprise.pkl +3 -0
db/2d/2d-what-attribute/llava15_13b_surprise.pkl +3 -0
db/2d/2d-what-attribute/llava15_7b_surprise.pkl +3 -0
db/2d/2d-what-attribute/merged_data.csv +0 -0
db/2d/2d-what-attribute/path.json +0 -0
db/2d/2d-what-attribute/qa.pkl +3 -0
db/2d/2d-what-attribute/qwenvl_chat_surprise.pkl +3 -0
db/2d/2d-what-attribute/qwenvl_surprise.pkl +3 -0
db/2d/2d-what-attribute/task_plan.pkl +3 -0
db/2d/2d-what/embeddings.pkl +3 -0
db/2d/2d-what/expanded_data.csv +0 -0
db/2d/2d-what/gt.pkl +3 -0
db/2d/2d-what/instructblip_vicuna13b_surprise.pkl +3 -0
db/2d/2d-what/instructblip_vicuna7b_surprise.pkl +3 -0
db/2d/2d-what/llava15_13b_surprise.pkl +3 -0
db/2d/2d-what/llava15_7b_surprise.pkl +3 -0
db/2d/2d-what/merged_data.csv +0 -0
db/2d/2d-what/path.json +0 -0
db/2d/2d-what/qa.pkl +3 -0
db/2d/2d-what/qwenvl_chat_surprise.pkl +3 -0
db/2d/2d-what/qwenvl_surprise.pkl +3 -0
db/2d/2d-what/task_plan.pkl +3 -0
db/2d/2d-where-attribute/embeddings.pkl +3 -0
db/2d/2d-where-attribute/expanded_data.csv +0 -0
db/2d/2d-where-attribute/gt.pkl +3 -0
db/2d/2d-where-attribute/instructblip_vicuna13b_surprise.pkl +3 -0
db/2d/2d-where-attribute/instructblip_vicuna7b_surprise.pkl +3 -0
db/2d/2d-where-attribute/llava15_13b_surprise.pkl +3 -0
db/2d/2d-where-attribute/llava15_7b_surprise.pkl +3 -0
db/2d/2d-where-attribute/merged_data.csv +0 -0
db/2d/2d-where-attribute/path.json +0 -0
db/2d/2d-where-attribute/qa.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,1231 @@

+import pandas as pd
+import numpy as np
+import os
+from copy import deepcopy
+import gradio as gr
+import altair as alt
+alt.data_transformers.enable("vegafusion")
+from dynabench.task_evaluator import *
+BASE_DIR = "../db"
+MODELS = ['qwenvl-chat', 'qwenvl', 'llava15-7b', 'llava15-13b', 'instructblip-vicuna13b', 'instructblip-vicuna7b']
+VIDEO_MODELS = ['video-chat2-7b','video-llama2-7b','video-llama2-13b','chat-univi-7b','chat-univi-13b','video-llava-7b','video-chatgpt-7b']
+domains = ["imageqa-2d-sticker", "imageqa-3d-tabletop", "imageqa-scene-graph", "videoqa-3d-tabletop", "videoqa-scene-graph"]
+domain2folder = {"imageqa-2d-sticker": "2d",
+                 "imageqa-3d-tabletop": "3d",
+                 "imageqa-scene-graph": "sg",
+                 "videoqa-3d-tabletop": "video-3d",
+                 "videoqa-scene-graph": "video-sg",
+                 None: '2d'}
+def update_partition_and_models(domain):
+    domain = domain2folder[domain]
+    path = f"{BASE_DIR}/{domain}"
+    if os.path.exists(path):
+        partitions = list_directories(path)
+        if domain.find("video") > -1:
+            model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model")
+        else:
+            model = gr.Dropdown(MODELS, value=MODELS[0], label="model")
+        partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
+        return [partition, model]
+    else:
+        partition = gr.Dropdown([], value=None, label="task space of the following task generator")
+        model = gr.Dropdown([], value=None, label="model")
+        return [partition, model]
+def update_partition_and_models_and_baselines(domain):
+    domain = domain2folder[domain]
+    path = f"{BASE_DIR}/{domain}"
+    if os.path.exists(path):
+        partitions = list_directories(path)
+        if domain.find("video") > -1:
+            model = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="model")
+            baseline = gr.Dropdown(VIDEO_MODELS, value=VIDEO_MODELS[0], label="baseline")
+        else:
+            model = gr.Dropdown(MODELS, value=MODELS[0], label="model")
+            baseline = gr.Dropdown(MODELS, value=MODELS[0], label="baseline")
+        partition = gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
+    else:
+        partition = gr.Dropdown([], value=None, label="task space of the following task generator")
+        model = gr.Dropdown([], value=None, label="model")
+        baseline = gr.Dropdown([], value=None, label="baseline")
+    return [partition, model, baseline]
+def get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if not os.path.exists(data_path):
+        return []
+    else:
+        merged_df = pd.read_csv(data_path)
+        merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
+        df = merged_df
+        select_top = rank == "top"
+        # Model X is good / bad at
+        for model in models:
+            if baseline:
+                df = df[df[model] >= df[baseline]]
+            else:
+                if select_top:
+                    df = df[df[model] >= threshold]
+                else:
+                    df = df[df[model] <= threshold]
+        if not baseline:
+            df['mean score'] = df[models].mean(axis=1)
+            df = df.sort_values(by='mean score', ascending=False)
+            df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
+        task_ids = list(df.index)
+    return task_ids
+def plot_patterns(domain, partition, models, rank, k, threshold, baseline, pattern, order):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    if not os.path.exists(data_path):
+        return None
+    task_ids = get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline)
+    expand_df = pd.read_csv(data_path)
+    chart_df = expand_df[expand_df['model'].isin((models + [baseline]) if baseline else models)]
+    chart_df = chart_df[chart_df['task id'].isin(task_ids)]
+    print(pattern)
+    freq, cols = eval(pattern)
+    pattern_str = ""
+    df = chart_df
+    for col in cols:
+        col_name, col_val = col
+        try:
+            col_val = int(col_val)
+        except:
+            col_val = col_val
+        df = df[df[col_name] == col_val]
+        pattern_str += f"{col_name} = {col_val}, "
+        print(len(df))
+    if baseline:
+        model_str = (', '.join(models) if len(models) > 1 else models[0])
+        phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
+        title = f"{phrase} better than {baseline} on {freq} tasks where {pattern_str[:-2]}"
+    else:
+        title = f"Models are {'best' if rank == 'top' else 'worst'} at {freq} tasks where {pattern_str[:-2]}"
+    chart = alt.Chart(df).mark_bar().encode(
+            alt.X('model:N',
+                sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
+                axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle),
+            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+            alt.Color('model:N').legend(),
+        ).properties(
+            width=400,
+            height=300,
+            title=title
+        )
+    return chart
+def plot_embedding(domain, partition, category):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if os.path.exists(data_path):
+        merged_df = pd.read_csv(data_path)
+        # models = merged_df.columns
+        has_image = 'image' in merged_df
+        chart = alt.Chart(merged_df).mark_point(size=30, filled=True).encode(
+            alt.OpacityValue(0.5),
+            alt.X('x:Q'),
+            alt.Y('y:Q'),
+            alt.Color(f'{category}:N'),
+            tooltip=['question', 'answer'] + (['image'] if has_image else []),
+        ).properties(
+            width=800,
+            height=800,
+            title="UMAP Projected Task Embeddings"
+        ).configure_axis(
+            labelFontSize=25,
+            titleFontSize=25,
+        ).configure_title(
+            fontSize=40
+        ).configure_legend(
+            labelFontSize=25,
+            titleFontSize=25,
+        ).interactive()
+        return chart
+    else:
+        return None
+def plot_multi_models(domain, partition, category, cat_options, models, order, pattern, aggregate="mean"):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    if not os.path.exists(data_path):
+        return None
+    expand_df = pd.read_csv(data_path)
+    print(pattern)
+    if pattern is not None:
+        df = expand_df
+        freq, cols = eval(pattern)
+        pattern_str = ""
+        for col in cols:
+            col_name, col_val = col
+            try:
+                col_val = int(col_val)
+            except:
+                col_val = col_val
+            df = df[df[col_name] == col_val]
+            pattern_str += f"{col_name} = {col_val}, "
+        chart = alt.Chart(df).mark_bar().encode(
+            alt.X('model:N',
+                sort=alt.EncodingSortField(field=f'score', order='ascending', op="mean"),
+                axis=alt.Axis(labels=False, tickSize=0)), # no title, no label angle),
+            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+            alt.Color('model:N').legend(),
+        ).properties(
+            width=200,
+            height=100,
+            title=f"How do models perform on tasks where {pattern_str[:-2]} (N={freq})?"
+        )
+        return chart
+    else:
+        df = expand_df[(expand_df['model'].isin(models)) & (expand_df[category].isin(cat_options))]
+        if len(models) > 1:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X('model:N',
+                    sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
+                    axis=alt.Axis(labels=False, tickSize=0, title=None)),
+                alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+                alt.Color('model:N').legend(),
+                alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
+            ).properties(
+                width=200,
+                height=100,
+                title=f"How do models perform across {category}?"
+            )
+        else:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle),
+                alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+                alt.Color(f'{category}:N').legend(None),
+            ).properties(
+                width=200,
+                height=100,
+                title=f"How does {models[0]} perform across {category}?"
+            )
+        chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
+        return chart
+def plot(domain, partition, models, rank, k, threshold, baseline, order, category, cat_options):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    expand_data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    # task_plan.reset_index(inplace=True)
+    if not os.path.exists(data_path) or not os.path.exists(expand_data_path):
+        return None
+    else:
+        merged_df = pd.read_csv(data_path)
+        merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
+        expand_df = pd.read_csv(expand_data_path)
+        df = merged_df
+        select_top = rank == "top"
+        # Model X is good / bad at
+        for model in models:
+            if baseline:
+                df = df[df[model] >= df[baseline]]
+            else:
+                if select_top:
+                    df = df[df[model] >= threshold]
+                else:
+                    df = df[df[model] <= threshold]
+        if not baseline:
+            df['mean score'] = df[models].mean(axis=1)
+            df = df.sort_values(by='mean score', ascending=False)
+            df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
+        task_ids = list(df.index)
+        if baseline:
+            models += [baseline]
+        chart_df = expand_df[expand_df['model'].isin(models)]
+        chart_df = chart_df[chart_df['task id'].isin(task_ids)]
+        if cat_options:
+            df = chart_df[chart_df[category].isin(cat_options)]
+        else:
+            df = chart_df
+        if baseline:
+            model_str = (', '.join(models) if len(models) > 1 else models[0])
+            phrase = f'{model_str} perform' if len(models) > 1 else f'{model_str} performs'
+            title = f"Are there any tasks where {phrase} better than {baseline} (by {category})?"
+        else:
+            title = f"What tasks are models {'best' if select_top else 'worst'} at by {category}?"
+        if len(models) > 1:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X('model:N',
+                    sort=alt.EncodingSortField(field=f'score', order=order, op="mean"),
+                    axis=alt.Axis(labels=False, tickSize=0, title=None)),
+                alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+                alt.Color('model:N').legend(),
+                alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
+            ).properties(
+                width=200,
+                height=100,
+                title=title
+            )
+        else:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order=order, op="mean")), # no title, no label angle),
+                alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
+                alt.Color(f'{category}:N').legend(None),
+            ).properties(
+                width=200,
+                height=100,
+                title=f"What tasks is model {models[0]} {'best' if select_top else 'worst'} at by {category}?"
+            )
+        chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
+        return chart
+def get_frequent_patterns(task_plan, scores):
+    find_frequent_patterns(k=10, df=task_plan, scores=scores)
+def list_directories(path):
+    """List all directories within a given path."""
+    return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
+def update_category(domain, partition):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
+    if os.path.exists(data_path):
+        data = pickle.load(open(data_path, 'rb'))
+        categories = list(data.columns)
+        category = gr.Dropdown(categories+["task id"], value=None, label="task metadata", interactive=True)
+        return category
+    else:
+        return gr.Dropdown([], value=None, label="task metadata")
+def update_category2(domain, partition, existing_category):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
+    if os.path.exists(data_path):
+        data = pickle.load(open(data_path, 'rb'))
+        categories = list(data.columns)
+        if existing_category and existing_category in categories:
+            categories.remove(existing_category)
+        category = gr.Dropdown(categories, value=None, label="Optional: second task metadata", interactive=True)
+        return category
+    else:
+        return gr.Dropdown([], value=None, label="task metadata")
+def update_partition(domain):
+    domain = domain2folder[domain]
+    path = f"{BASE_DIR}/{domain}"
+    if os.path.exists(path):
+        partitions = list_directories(path)
+        return gr.Dropdown(partitions, value=partitions[0], label="task space of the following task generator")
+    else:
+        return gr.Dropdown([], value=None, label="task space of the following task generator")
+def update_k(domain, partition, category=None):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if os.path.exists(data_path):
+        data = pd.read_csv(data_path)
+        max_k = len(data[category].unique()) if category and category != "task id" else len(data)
+        mid = max_k // 2
+        return gr.Slider(1, max_k, mid, step=1.0, label="k")
+    else:
+        return gr.Slider(1, 1, 1, step=1.0, label="k")
+# def update_category_values(domain, partition, category):
+#     data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+#     if os.path.exists(data_path) and category is not None:
+#         data = pd.read_csv(data_path)
+#         uni_cats = list(data[category].unique())
+#         return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values")
+#     else:
+#         return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values")
+# def update_category_values(domain, partition, models, rank, k, threshold, baseline, category):
+#     data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+#     if not os.path.exists(data_path):
+#         return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values")
+#     else:
+#         merged_df = pd.read_csv(data_path)
+#         merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True)
+#         df = merged_df
+#         select_top = rank == "top"
+#         # Model X is good / bad at
+#         for model in models:
+#             if baseline:
+#                 df = df[df[model] >= df[baseline]]
+#             else:
+#                 if select_top:
+#                     df = df[df[model] >= threshold]
+#                 else:
+#                     df = df[df[model] <= threshold]
+#         if not baseline:
+#             df['mean score'] = df[models].mean(axis=1)
+#             df = df.sort_values(by='mean score', ascending=False)
+#             df = df.iloc[:k, :] if select_top else df.iloc[-k:, :]
+#         uni_cats = list(df[category].unique())
+#         return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values")
+def update_tasks(domain, partition, find_pattern):
+    domain = domain2folder[domain]
+    if find_pattern == "yes":
+        k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=True)
+        pattern = gr.Dropdown([], value=None, interactive=True, label="pattern")
+        category1 =  gr.Dropdown([], value=None, interactive=False, label="task metadata")
+        return [k1,  pattern, category1]
+    else:
+        k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
+        pattern = gr.Dropdown([], value=None, interactive=False, label="pattern")
+        data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+        if os.path.exists(data_path):
+            data = pd.read_csv(data_path)
+            non_columns =  MODELS + ['question', 'answer']
+            categories = [cat for cat in list(data.columns) if cat not in non_columns]
+            category1 = gr.Dropdown(categories, value=categories[0], interactive=True, label="task metadata")
+        else:
+            category1 = gr.Dropdown([], value=None, label="task metadata")
+        return [k1, pattern, category1]
+def update_pattern(domain, partition, k):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/patterns.pkl"
+    if not os.path.exists(data_path):
+        return gr.Dropdown([], value=None, interactive=False, label="pattern")
+    else:
+        results = pickle.load(open(data_path, 'rb'))
+        patterns = results[0]
+        patterns = [str(p) for p in patterns]
+        print(patterns)
+        return gr.Dropdown(patterns[:k], value=None, interactive=True, label="pattern")
+def update_threshold(domain, partition, baseline):
+    domain = domain2folder[domain]
+    print(baseline)
+    if baseline:
+        rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=False)
+        k = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=False)
+        return [rank, k, threshold]
+    else:
+        data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+        if os.path.exists(data_path):
+            data = pd.read_csv(data_path)
+            max_k = len(data)
+            print(max_k)
+            k = gr.Slider(1, max_k, 10, step=1.0, label="k", interactive=True)
+        else:
+            k = gr.Slider(1, 1, 1, step=1.0, label="k")
+        rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=True)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True)
+        return [rank, k, threshold]
+def calc_surprisingness(model, scores, embeddings, k):
+    scores = scores[model].to_numpy()
+    sim = embeddings @ embeddings.T
+    # print("sim values:", sim.shape, sim)
+    indices = np.argsort(-sim)[:, :k]
+    # print("indices:", indices.shape, indices)
+    score_diff = scores[:, None] - scores[indices]
+    # print("score differences:", score_diff.shape, score_diff)
+    sim = sim[np.arange(len(scores))[:, None], indices]
+    # print("top10 sim:", sim.shape, sim)
+    all_surprisingness = score_diff * sim
+    # print("all surprisingness:", all_surprisingness.shape, all_surprisingness)
+    mean_surprisingness = np.mean(score_diff * sim, axis=1)
+    res = {'similarity': sim,
+           'task index': indices,
+           'score difference': score_diff,
+           'all surprisingness': all_surprisingness,
+           'mean surprisingness': mean_surprisingness
+          }
+    return res
+def plot_surprisingness(domain, partition, model, rank, k, num_neighbors):
+    domain = domain2folder[domain]
+    # model = model[0]
+    model_str = model.replace("-", "_")
+    # sp_path = f"{BASE_DIR}/{domain}/{partition}/surprise_data.csv"
+    sp_pkl = f"{BASE_DIR}/{domain}/{partition}/{model_str}_surprise.pkl"
+    merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if os.path.exists(sp_pkl) and os.path.exists(merged_path): # and not os.path.exists(sp_path)
+        # if os.path.exists(sp_path):
+        #     sp_df = pd.read_csv(sp_path)
+        #     # res = calc_surprisingness(model, scores, embeds, num_neighbors)
+        #     # k = 10
+        #     model = 'qwenvl'
+        #     num_neighbors = 10
+        # if os.path.exists(sp_pkl):
+        res = pickle.load(open(sp_pkl, 'rb'))
+        total_num_task = res['task index'].shape[0]
+        all_records = []
+        for i in range(total_num_task):
+            mean_surprisingness = np.mean(res['all surprisingness'][i, :num_neighbors])
+            for j in range(num_neighbors):
+                neighbor_id = res['task index'][i, j]
+                score_diff = res['score difference'][i, j]
+                surprisingness = res['all surprisingness'][i, j]
+                similarity = res['similarity'][i, j]
+                record = {"task id": i,
+                        "neighbor rank": j,
+                        "neighbor id": neighbor_id,
+                        "score difference": score_diff,
+                        "surprisingness": surprisingness,
+                        "mean surprisingness": mean_surprisingness,
+                        "similarity": similarity
+                        }
+                # print(record)
+                all_records.append(record)
+        sp_df = pd.DataFrame.from_records(all_records)
+        sp_df = sp_df.sort_values(by="mean surprisingness", ascending=False)
+        num_rows = k * num_neighbors
+        df = sp_df.iloc[:num_rows, :] if rank == "top" else sp_df.iloc[-num_rows:, :]
+        print(len(df))
+        df['is target'] = df.apply(lambda row: int(row['task id'] == row['neighbor id']), axis=1)
+        merged_df = pd.read_csv(merged_path)
+        for col in merged_df.columns:
+            df[col] = df.apply(lambda row: merged_df.iloc[int(row['neighbor id']), :][col], axis=1)
+        tooltips = ['neighbor id'] + ['image', 'question', 'answer', model]
+        print(df.head())
+        pts = alt.selection_point(encodings=['x'])
+        embeds = alt.Chart(df).mark_point(size=30, filled=True).encode(
+            alt.OpacityValue(0.5),
+            alt.X('x:Q', scale=alt.Scale(zero=False)),
+            alt.Y('y:Q', scale=alt.Scale(zero=False)),
+            alt.Color(f'{model}:Q'), #scale=alt.Scale(domain=[1, 0.5, 0], range=['blue', 'white', 'red'], interpolate='rgb')
+            alt.Size("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=[300, 500])),
+            alt.Shape("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'triangle'])),
+            alt.Order("is target:N"),
+            tooltip=tooltips,
+        ).properties(
+            width=400,
+            height=400,
+            title=f"What are the tasks {model} is surprisingly {'good' if rank == 'top' else 'bad'} at compared to {num_neighbors} similar tasks?"
+        ).transform_filter(
+            pts
+        )
+        bar = alt.Chart(df).mark_bar().encode(
+            alt.Y('mean(mean surprisingness):Q'),
+            alt.X('task id:N', sort=alt.EncodingSortField(field='mean surprisingness', order='descending')),
+            color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")), #
+        ).add_params(pts).properties(
+            width=400,
+            height=200,
+        )
+        chart = alt.hconcat(
+            bar,
+            embeds
+        ).resolve_legend(
+            color="independent",
+            size="independent"
+        ).configure_title(
+            fontSize=20
+        ).configure_legend(
+            labelFontSize=10,
+            titleFontSize=10,
+        )
+        return chart
+    else:
+        print(sp_pkl, merged_path)
+        return None
+def plot_task_distribution(domain, partition, category):
+    domain = domain2folder[domain]
+    task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", "rb"))
+    task_plan.reset_index(inplace=True)
+    col_name = category
+    task_plan_cnt = task_plan.groupby(col_name)['index'].count().reset_index()
+    task_plan_cnt.rename(columns={'index': 'count'}, inplace=True)
+    task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(task_plan) * 100, 2)
+    task_plan_cnt.head()
+    base = alt.Chart(task_plan_cnt).encode(
+    alt.Theta("count:Q").stack(True),
+    alt.Color(f"{col_name}:N").legend(),
+    tooltip=[col_name, 'count', 'frequency (%)']
+    )
+    pie = base.mark_arc(outerRadius=120)
+    return pie
+def plot_all(domain, partition, models, category1, category2, agg):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    if not os.path.exists(data_path):
+        return None
+    expand_df = pd.read_csv(data_path)
+    chart_df = expand_df[expand_df['model'].isin(models)]
+    if category2:
+        color_val = f'{agg}(score):Q'
+        chart = alt.Chart(chart_df).mark_rect().encode(
+            alt.X(f'{category1}:N', sort=alt.EncodingSortField(field='score', order='ascending', op=agg)),
+            alt.Y(f'{category2}:N', sort=alt.EncodingSortField(field='score', order='descending', op=agg)), # no title, no label angle),
+            alt.Color(color_val),
+            alt.Tooltip('score', aggregate=agg, title=f"{agg} score"),
+        ).properties(
+            width=800,
+            height=200,
+        )
+    else:
+        category = "index" if category1 == "task id" else category1
+        # cat_options = list(chart_df[category].unique())
+        # cat_options = cat_options[:5]
+        y_val = f'{agg}(score):Q'
+        df = chart_df
+        # df = chart_df[chart_df[category].isin(cat_options)]
+        if len(models) > 1:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X('model:N',
+                    sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg),
+                    axis=alt.Axis(labels=False, tickSize=0, title=None)),
+                alt.Y(y_val, scale=alt.Scale(zero=True)),
+                alt.Color('model:N').legend(),
+                alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
+            ).properties(
+                width=200,
+                height=100,
+                title=f"How do models perform across {category}?"
+            )
+        else:
+            chart = alt.Chart(df).mark_bar().encode(
+                alt.X(f'{category}:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=agg)), # no title, no label angle),
+                alt.Y(y_val, scale=alt.Scale(zero=True)),
+                alt.Color(f'{category}:N').legend(None),
+            ).properties(
+                width=200,
+                height=100,
+                title=f"How does {models[0]} perform across {category}?"
+            )
+        chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
+            labelFontSize=20,
+            titleFontSize=20,
+        ).configure_legend(
+            labelFontSize=15,
+            titleFontSize=15,
+        )
+    return chart
+def update_widgets(domain, partition, category, query_type):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    if not os.path.exists(data_path):
+        print("here?")
+        return [None] * 11
+    df = pd.read_csv(data_path)
+    max_k = len(df[category].unique()) if category and category != "task id" else len(df)
+    widgets = []
+    if query_type == "top k":
+        # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
+        k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", interactive=True, visible=True)
+        model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)'", multiselect=True, interactive=True, visible=True)
+        # model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True)
+        model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
+        direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
+        baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
+        md1 = gr.Markdown(r"<h2>ranked by the </h2>")
+        md2 = gr.Markdown(r"<h2>accuracy</h2>")
+        md3 = gr.Markdown(r"")
+    elif query_type == "threshold":
+        # aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=True)
+        # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)'", multiselect=True, interactive=True, visible=True)
+        direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
+        # model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate", interactive=True, visible=True)
+        model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
+        k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
+        baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
+        baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
+        md1 = gr.Markdown(r"<h2>where the</h2>")
+        md2 = gr.Markdown(r"<h2>accuracy is</h2>")
+        md3 = gr.Markdown(r"")
+    elif query_type == "model comparison":
+        model = gr.Dropdown(MODELS, value=MODELS[0], label="of model(s)' accuracy", multiselect=True, interactive=True, visible=True)
+        baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)' accuracy", multiselect=True, interactive=True, visible=True)
+        direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
+        model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        # baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over baselines)", interactive=True, visible=True)
+        baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+        # aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task aggregate", interactive=True, visible=False)
+        rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
+        k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
+        md1 = gr.Markdown(r"<h2>where the difference between the </h2>")
+        md2 = gr.Markdown(r"<h2>is </h2>")
+        md3 = gr.Markdown(r"<h2>and the</h2>")
+    elif query_type == "model debugging":
+        model = gr.Dropdown(MODELS, value=MODELS[0], label="model's", multiselect=False, interactive=True, visible=True)
+        # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", visible=False)
+        baseline = gr.Dropdown(MODELS, value=None, label="baseline", visible=False)
+        direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
+        threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
+        rank = gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)
+        k = gr.Slider(1, max_k, max_k // 2, step=1.0, label="k", visible=False)
+        model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over models)", visible=False)
+        baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="baseline aggregate", visible=False)
+        md1 = gr.Markdown(r"<h2>where </h2>")
+        md2 = gr.Markdown(r"<h2>mean accuracy is below its overall mean accuracy by one standard deviation</h2>")
+        md3 = gr.Markdown(r"")
+    else:
+        widgets = [None] * 11
+    widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
+    return widgets
+def select_tasks(domain, partition, category, query_type, task_agg, models, model_agg, rank, k, direction, threshold, baselines, baseline_agg):
+    domain = domain2folder[domain]
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    merged_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if not os.path.exists(data_path) or not os.path.exists(merged_path):
+        return gr.DataFrame(None)
+    df = pd.read_csv(data_path)
+    merged_df = pd.read_csv(merged_path)
+    task_plan = pickle.load(open(f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl", 'rb'))
+    task_plan.reset_index(inplace=True)
+    if not category or category == "task id":
+        category = 'index'
+    if query_type == "top k":
+        df = df[df['model'].isin(models)]
+        df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
+        df = df.groupby([category])['score'].agg(model_agg).reset_index()
+        df = df.sort_values(by='score', ascending=False)
+        if rank == "bottom":
+            df = df.iloc[-k:, :]
+        else:
+            df = df.iloc[:k, :]
+    elif query_type == "threshold":
+        df = df[df['model'].isin(models)]
+        df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
+        df = df.groupby([category])['score'].agg(model_agg).reset_index()
+        if direction == "below":
+            df = df[df['score'] <= threshold]
+        else:
+            df = df[df['score'] >= threshold]
+    elif query_type == "model comparison":
+        # df = merged_df
+        # df.reset_index(inplace=True)
+        # df = df.groupby([category])[[model, baseline]].agg(task_agg).reset_index()
+        # df = df[(df[model] - df[baseline] > threshold)]
+        df_baseline = deepcopy(df)
+        df = df[df['model'].isin(models)]
+        df = df.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
+        df = df.groupby([category])['score'].agg(model_agg).reset_index()
+        model_str = ', '.join(models)
+        exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
+        df = df.sort_values(by=category)
+        df_baseline = df_baseline[df_baseline['model'].isin(baselines)]
+        df_baseline = df_baseline.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
+        df_baseline = df_baseline.groupby([category])['score'].agg(baseline_agg).reset_index()
+        model_str = ', '.join(baselines)
+        baseline_score_id = f'{baseline_agg}({model_str})' if len(baselines) > 1 else model_str
+        df_baseline = df_baseline.sort_values(by=category)
+        df.rename(columns={'score': exp_score_id}, inplace=True)
+        df_baseline.rename(columns={'score': baseline_score_id}, inplace=True)
+        df = pd.merge(df, df_baseline, on=category)
+        df = df[(df[exp_score_id] - df[baseline_score_id] > threshold)]
+    elif query_type == "model debugging":
+        model = models
+        print(models)
+        avg_acc = merged_df[model].mean()
+        std = merged_df[model].std()
+        t = avg_acc - std
+        df = df[df['model'] == model]
+        df = df.groupby(['model', category])['score'].agg(task_agg).reset_index()
+        df = df[df['score'] < t]
+        df['mean'] = round(avg_acc, 4)
+        df['std'] = round(std, 4)
+    print(df.head())
+    if category == 'index':
+        task_attrs = list(df[category])
+        selected_tasks = task_plan[task_plan[category].isin(task_attrs)]
+        if len(selected_tasks) == 0:
+            return gr.DataFrame(None, label="There is no such task.")
+        if query_type == "model comparison" and (models and baselines):
+            # selected_tasks[model] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][model].values[0], axis=1)
+            # selected_tasks[baseline] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline].values[0], axis=1)
+            selected_tasks[exp_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][exp_score_id].values[0], axis=1)
+            selected_tasks[baseline_score_id] = selected_tasks.apply(lambda row: df[df['index'] == row['index']][baseline_score_id].values[0], axis=1)
+        else:
+            selected_tasks['score'] = selected_tasks.apply(lambda row: df[df['index'] == row['index']]['score'].values[0], axis=1)
+        print(selected_tasks.head())
+        return gr.DataFrame(selected_tasks, label=f"There are {len(selected_tasks)} (out of {len(task_plan)}) tasks in total.")
+    else:
+        if len(df) == 0:
+            return gr.DataFrame(None, label=f"There is no such {category}.")
+        else:
+            return gr.DataFrame(df, label=f"The total number of such {category} is {len(df)}.")
+def find_patterns(selected_tasks, num_patterns, models, baselines, model_agg, baseline_agg):
+    if len(selected_tasks) == 0:
+        return gr.DataFrame(None)
+    print(selected_tasks.head())
+    if 'score' in selected_tasks:
+        scores = selected_tasks['score']
+    # elif model in selected_tasks:
+    #     scores = selected_tasks[model]
+    else:
+        scores = None
+    print(scores)
+    model_str = ', '.join(models)
+    exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
+    if baselines:
+        baseline_str = ', '.join(baselines)
+        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
+    tasks_only = selected_tasks
+    all_score_cols = ['score', exp_score_id]
+    if baselines:
+        all_score_cols += [baseline_score_id]
+    for name in all_score_cols:
+        if name in selected_tasks:
+            tasks_only = tasks_only.drop(name, axis=1)
+    results = find_frequent_patterns(k=num_patterns, df=tasks_only, scores=scores)
+    records = []
+    if scores is not None:
+        patterns, scores = results[0], results[1]
+        for pattern, score in zip(patterns, scores):
+            pattern_str = ""
+            for t in pattern[1]:
+                col_name, col_val = t
+                pattern_str += f"{col_name} = {col_val}, "
+            record = {'pattern': pattern_str[:-2], 'count': pattern[0], 'score': score} #{model}
+            records.append(record)
+    else:
+        patterns = results
+        for pattern in patterns:
+            pattern_str = ""
+            for t in pattern[1]:
+                col_name, col_val = t
+                pattern_str += f"{col_name} = {col_val}, "
+            record = {'pattern': pattern_str[:-2], 'count': pattern[0]}
+            records.append(record)
+    df = pd.DataFrame.from_records(records)
+    return gr.DataFrame(df)
+def visualize_task_distribution(selected_tasks, col_name, model1, model2):
+    if not col_name:
+        return None
+    task_plan_cnt = selected_tasks.groupby(col_name)['index'].count().reset_index()
+    task_plan_cnt.rename(columns={'index': 'count'}, inplace=True)
+    task_plan_cnt['frequency (%)'] = round(task_plan_cnt['count'] / len(selected_tasks) * 100, 2)
+    print(task_plan_cnt.head())
+    tooltips = [col_name, 'count', 'frequency (%)']
+    base = alt.Chart(task_plan_cnt).encode(
+        alt.Theta("count:Q").stack(True),
+        alt.Color(f"{col_name}:N").legend(),
+        tooltip=tooltips
+    )
+    pie = base.mark_arc(outerRadius=120)
+    return pie
+def plot_performance_for_selected_tasks(domain, partition, df, query_type, models, baselines, select_category, vis_category, task_agg, model_agg, baseline_agg, rank, direction, threshold):
+    domain = domain2folder[domain]
+    task_agg = "mean"
+    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
+    mereged_data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
+    if not os.path.exists(data_path) or not os.path.exists(mereged_data_path) or len(df) == 0:
+        return None
+    select_tasks = select_category == "task id" and vis_category
+    if select_tasks: # select tasks
+        y_val = f'{task_agg}(score):Q'
+    else: # select task categories
+        y_val = f'score:Q'
+    if select_category == "task id":
+        select_category = "index"
+    print(df.head())
+    if query_type == "model comparison":
+        # re-format the data for plotting
+        model_str = ', '.join(models)
+        exp_score_id = f'{model_agg}({model_str})' if len(models) > 1 else model_str
+        baseline_str = ', '.join(baselines)
+        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baselines) > 1 else baseline_str
+        # other_cols = list(df.columns)
+        # other_cols.remove(select_category)
+        print(exp_score_id, baseline_score_id)
+        df = df.melt(id_vars=[select_category], value_vars=[exp_score_id, baseline_score_id])
+        df.rename(columns={'variable': 'model', 'value': 'score'}, inplace=True)
+        print(df.head())
+        if select_tasks:
+            merged_df = pd.read_csv(mereged_data_path)
+            df[vis_category] = df.apply(lambda row: merged_df[merged_df.index == row['index']][vis_category].values[0], axis=1)
+        num_columns = len(df['model'].unique()) * len(df[f'{vis_category}'].unique())
+        chart = alt.Chart(df).mark_bar().encode(
+            alt.X('model:N',
+                sort=alt.EncodingSortField(field=f'score', order='descending', op=task_agg),
+                axis=alt.Axis(labels=False, tickSize=0, title=None)),
+            alt.Y(y_val, scale=alt.Scale(zero=True), title="accuracy"),
+            alt.Color('model:N').legend(),
+            alt.Column(f'{vis_category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom', labelFontSize=20, titleFontSize=20,))
+        ).properties(
+            width=num_columns * 30,
+            height=200,
+            title=f"How do models perform by {vis_category}?"
+        )
+        print(num_columns * 50)
+    else:
+        if query_type == "model debugging":
+            y_title = "accuracy"
+            plot_title = f"{models} performs worse than its (mean - std) on these {vis_category}s"
+            models = [models]
+        else:
+            model_str = ', '.join(models)
+            y_title = f"{model_agg} accuracy" if len(models) > 0 else "accuracy"
+            suffix = f"on these tasks (by {vis_category})" if select_category == "index" else f"on these {vis_category}s"
+            if query_type == "top k":
+                plot_title = f"The {model_agg} accuracy of {model_str} is the {'highest' if rank == 'top' else 'lowest'} " + suffix
+            elif query_type == "threshold":
+                plot_title = f"The {model_agg} accuracy of {model_str} is {direction} {threshold} " + suffix
+        if select_tasks:
+            expand_df = pd.read_csv(data_path)
+            task_ids = list(df['index'].unique())
+            # all_models = (models + baselines) if baselines else models
+            df = expand_df[(expand_df['model'].isin(models)) & (expand_df['task id'].isin(task_ids))]
+        num_columns = len(df[f'{vis_category}'].unique())
+        chart = alt.Chart(df).mark_bar().encode(
+            alt.X(f'{vis_category}:N', sort=alt.EncodingSortField(field=f'score', order='ascending', op=task_agg), axis=alt.Axis(labelAngle=-20)), # no title, no label angle),
+            alt.Y(y_val, scale=alt.Scale(zero=True), title=y_title),
+            alt.Color(f'{vis_category}:N').legend(None),
+        ).properties(
+            width=num_columns * 30,
+            height=200,
+            title=plot_title
+        )
+    chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
+            labelFontSize=20,
+            titleFontSize=20,
+        ).configure_legend(
+            labelFontSize=20,
+            titleFontSize=20,
+            labelLimit=200,
+        )
+    return chart
+def sync_vis_category(domain, partition, category):
+    domain = domain2folder[domain]
+    if category and category != "task id":
+        return [gr.Dropdown([category], value=category, label="by task metadata", interactive=False), gr.Dropdown([category], value=category, label="by task metadata", interactive=False)]
+    else:
+        data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
+        if os.path.exists(data_path):
+            data = pickle.load(open(data_path, 'rb'))
+            categories = list(data.columns)
+            return [gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True), gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True)]
+        else:
+            return [None, None]
+def hide_fpm_and_dist_components(domain, partition, category):
+    domain = domain2folder[domain]
+    print(category)
+    if category and category != "task id":
+        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", visible=False)
+        btn_pattern = gr.Button(value="Find patterns among tasks", visible=False)
+        table = gr.DataFrame({}, height=250, visible=False)
+        dist_chart = gr.Plot(visible=False)
+        col_name = gr.Dropdown([], value=None, label="by task metadata", visible=False)
+        btn_dist = gr.Button(value="Visualize task distribution", visible=False)
+    else:
+        data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
+        if os.path.exists(data_path):
+            data = pickle.load(open(data_path, 'rb'))
+            categories = list(data.columns)
+            col_name = gr.Dropdown(categories, value=categories[0], label="by task metadata", interactive=True, visible=True)
+        else:
+            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True, visible=True)
+        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", interactive=True, visible=True)
+        btn_pattern = gr.Button(value="Find patterns among tasks", interactive=True, visible=True)
+        table = gr.DataFrame({}, height=250, interactive=True, visible=True)
+        dist_chart = gr.Plot(visible=True)
+        btn_dist = gr.Button(value="Visualize task distribution", interactive=True, visible=True)
+    return [num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]
+# domains = list_directories(BASE_DIR)
+theme = gr.Theme.from_hub('sudeepshouche/minimalist')
+theme.font = [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"] # gr.themes.GoogleFont("Source Sans Pro") # [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"]
+theme.text_size = gr.themes.sizes.text_lg
+# theme = theme.set(font=)
+demo = gr.Blocks(theme=theme, title="TaskVerse-UI") #
+with demo:
+    with gr.Row():
+        with gr.Column(scale=1):
+                gr.Markdown(
+                    r""
+                )
+        with gr.Column(scale=1):
+            gr.Markdown(
+                    r"<h1>Welcome to TaskVerse-UI! </h1>"
+                )
+        with gr.Column(scale=1):
+            gr.Markdown(
+                    r""
+                )
+    with gr.Tab("📊 Overview"):
+        gr.Markdown(
+            r"<h2>📊 Visualize the overall task distribution and model performance </h2>"
+        )
+        with gr.Row():
+            domain = gr.Radio(domains, label="scenario", scale=2)
+            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
+            # domain.change(fn=update_partition, inputs=domain, outputs=partition)
+        gr.Markdown(
+            r"<h2>Overall task metadata distribution</h2>"
+        )
+        with gr.Row():
+            category = gr.Dropdown([], value=None, label="task metadata")
+            partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
+        with gr.Row():
+            output = gr.Plot()
+        with gr.Row():
+            btn = gr.Button(value="Plot")
+            btn.click(plot_task_distribution, [domain, partition, category], output)
+        gr.Markdown(
+            r"<h2>Models' overall performance by task metadata</h2>"
+        )
+        with gr.Row():
+            with gr.Column(scale=2):
+                models = gr.CheckboxGroup(MODELS, label="model(s)", value=MODELS)
+            with gr.Column(scale=1):
+                aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="aggregate models' accuracy by")
+        with gr.Row():
+            # with gr.Column(scale=1):
+                category1 = gr.Dropdown([], value=None, label="task metadata", interactive=True)
+                category2 = gr.Dropdown([], value=None, label="Optional: second task metadata", interactive=True)
+                partition.change(fn=update_category, inputs=[domain, partition], outputs=category1)
+                category1.change(fn=update_category2, inputs=[domain, partition, category1], outputs=category2)
+        domain.change(fn=update_partition_and_models, inputs=domain, outputs=[partition, models])
+        with gr.Row():
+            output = gr.Plot()
+        with gr.Row():
+            btn = gr.Button(value="Plot")
+            btn.click(plot_all, [domain, partition, models, category1, category2, aggregate], output)
+        # gr.Examples(["hello", "bonjour", "merhaba"], input_textbox)
+    with gr.Tab("✨ Embedding"):
+        gr.Markdown(
+            r"<h2>✨ Visualize the tasks' embeddings in the 2D space </h2>"
+        )
+        with gr.Row():
+            domain2 = gr.Radio(domains, label="scenario", scale=2)
+            # domain = gr.Dropdown(domains, value=domains[0], label="scenario")
+            partition2 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
+            category2 = gr.Dropdown([], value=None, label="colored by task metadata", scale=1)
+            domain2.change(fn=update_partition, inputs=domain2, outputs=partition2)
+            partition2.change(fn=update_category, inputs=[domain2, partition2], outputs=category2)
+        with gr.Row():
+            output2 = gr.Plot()
+        with gr.Row():
+            btn = gr.Button(value="Run")
+            btn.click(plot_embedding, [domain2, partition2, category2], output2)
+    with gr.Tab("❓ Query"):
+        gr.Markdown(
+            r"<h2>❓ Find out the answers to your queries by finding and visualizing the relevant tasks and models' performance </h2>"
+        )
+        with gr.Row(equal_height=True):
+            domain = gr.Radio(domains, label="scenario", scale=2)
+            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
+        with gr.Row():
+            query1 = "top k"
+            query2 = "threshold"
+            query3 = "model debugging"
+            query4 = "model comparison"
+            query_type = gr.Radio([query1, query2, query3, query4], value="top k", label=r"query type")
+        with gr.Row():
+            with gr.Accordion("See more details about the query type"):
+                gr.Markdown(
+                r"<ul><li>Top k: Find the k tasks or task metadata that the model(s) perform the best or worst on</li><li>Threshold: Find the tasks or task metadata where the model(s)' performance is greater or lower than a given threshold t</li><li>Model debugging: Find the tasks or task metadata where a model performs significantly worse than its average performance (by one standard deviation)</li><li>Model comparison: Find the tasks or task metadata where some model(s) perform better or worse than the baseline(s) by a given threshold t</li></ul>"
+            )
+        with gr.Row():
+            gr.Markdown(r"<h2>Help me find the</h2>")
+        with gr.Row(equal_height=True):
+            # with gr.Column(scale=1):
+            rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
+            # with gr.Column(scale=2):
+            k = gr.Slider(1, 10, 5 // 2, step=1.0, label="k", interactive=True, visible=True)
+            # with gr.Column(scale=2):
+            category = gr.Dropdown([], value=None, label="tasks / task metadata", interactive=True)
+        with gr.Row():
+            md1 = gr.Markdown(r"<h2>ranked by the </h2>")
+        with gr.Row(equal_height=True):
+            # with gr.Column(scale=1, min_width=100):
+                # model_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+            model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
+            # with gr.Column(scale=8):
+            model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)", multiselect=True, interactive=True, visible=True, scale=2)
+            # with gr.Column(scale=1, min_width=100):
+            # aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
+        with gr.Row():
+            md3 = gr.Markdown(r"")
+        with gr.Row(equal_height=True):
+            baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=False, scale=1)
+            baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)'", visible=False, scale=2)
+            # aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True)
+            # with gr.Column(scale=1, min_width=50):
+        with gr.Row():
+            md2 = gr.Markdown(r"<h2>accuracy</h2>")
+        with gr.Row():
+            # baseline_aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="task category aggregate (over baselines)", visible=False)
+            direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
+            threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)
+        widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
+        partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
+        query_type.change(update_widgets, [domain, partition, category, query_type], widgets)
+        domain.change(fn=update_partition_and_models_and_baselines, inputs=domain, outputs=[partition, model, baseline])
+        with gr.Row():
+            df = gr.DataFrame({}, height=200)
+        btn = gr.Button(value="Find tasks / task metadata")
+        btn.click(select_tasks, [domain, partition, category, query_type, aggregate, model, model_aggregate, rank, k, direction, threshold, baseline, baseline_aggregate], df)
+        with gr.Row():
+            plot = gr.Plot()
+        with gr.Row():
+            col_name2 = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
+            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name2)
+            btn_plot = gr.Button(value="Plot model performance", interactive=True)
+            btn_plot.click(plot_performance_for_selected_tasks, [domain, partition, df, query_type, model, baseline, category, col_name2, aggregate, model_aggregate, baseline_aggregate, rank, direction, threshold], plot)
+        with gr.Row():
+            dist_chart = gr.Plot()
+        with gr.Row():
+            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
+            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name)
+            btn_dist = gr.Button(value="Visualize task distribution", interactive=True)
+            btn_dist.click(visualize_task_distribution, [df, col_name, model, baseline], dist_chart)
+        with gr.Row():
+            table = gr.DataFrame({}, height=250)
+        with gr.Row():
+            num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns")
+            btn_pattern = gr.Button(value="Find patterns among tasks")
+            btn_pattern.click(find_patterns, [df, num_patterns, model, baseline], table)
+        category.change(fn=hide_fpm_and_dist_components, inputs=[domain, partition, category], outputs=[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart])
+        category.change(fn=sync_vis_category, inputs=[domain, partition, category], outputs=[col_name, col_name2])
+        category.change(fn=update_k, inputs=[domain, partition, category], outputs=k)
+    with gr.Tab("😮 Surprisingness"):
+        gr.Markdown(r"<h2>😮 Find out the tasks a model is surprisingly good or bad at compared to similar tasks</h2>")
+        with gr.Row():
+            domain3 = gr.Radio(domains, label="scenario", scale=2)
+            partition3 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
+        with gr.Row():
+            model3 = gr.Dropdown(MODELS, value=MODELS[0], label="model", interactive=True, visible=True)
+            k3 = gr.Slider(1, 100, 50, step=1.0, label="number of surprising tasks", interactive=True)
+            num_neighbors = gr.Slider(1, 100, 50, step=1.0, label="number of neighbors", interactive=True)
+            rank3 = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
+        domain3.change(fn=update_partition_and_models, inputs=domain3, outputs=[partition3, model3])
+        # partition3.change(fn=update_k, inputs=[domain3, partition3], outputs=k3)
+        with gr.Row():
+            output3 = gr.Plot()
+        with gr.Row():
+            btn = gr.Button(value="Plot")
+            btn.click(plot_surprisingness, [domain3, partition3, model3, rank3, k3, num_neighbors], output3)
+# if __name__ == "__main__":
+demo.launch(share=True)

db/2d/2d-how-many/embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35cfc281579e9d837218ac6edd58db0475c4728e526eb7d0f005d95772229692
+size 52955299

db/2d/2d-how-many/expanded_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-how-many/gt.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1afea495ff6a5425049d71163c4607af0942d1d750cc3625ecc310ddec123031
+size 828220

db/2d/2d-how-many/instructblip_vicuna13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e358eb20d0793cef91ee2e78c9257baa270f9ba5d96964b6605651048c7e0c1e
+size 48404804

db/2d/2d-how-many/instructblip_vicuna7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4347576213a57fdbc4a62c450072db4488ff83f922920daf8a5f8e48423be26e
+size 48404804

db/2d/2d-how-many/llava15_13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2cae2611f360876a8e9386cd4cae2491512fece11f3c1c963dafedaa5a7b3d4
+size 48404804

db/2d/2d-how-many/llava15_7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1afdc0e89ee8acbcf31992b66c746b1e56e0f6b6b295f78a63a3e7f624b33e
+size 48404804

db/2d/2d-how-many/merged_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-how-many/path.json ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-how-many/qa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e82b51a17f31e4f053b7c6cdc22a3f9dd437dc22436757a4e553ba8506f9cf22
+size 901939

db/2d/2d-how-many/qwenvl_chat_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75307671d6e0c9f8be5375113acac6b53dcaae4a9e5fd3b0ce2ba00fc76c30da
+size 48404804

db/2d/2d-how-many/qwenvl_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0035392b47d7158672b56f54b13d4f738c161cc8939e88b242e909458c6d3ad2
+size 48404804

db/2d/2d-how-many/task_plan.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:deaa99058099c23f6011e923e8abcd4e16b0f941a7172462c606b287a599d0a0
+size 861565

db/2d/2d-what-attribute/embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6956526b21a3ee60a05e86f0c2bad028644d03d42f0216a538ce97016f1b699c
+size 39137443

db/2d/2d-what-attribute/expanded_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what-attribute/gt.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b367e0455f641e152fe469a0a7832d8f626a4b9657052342d494ad7a180ff42
+size 612316

db/2d/2d-what-attribute/instructblip_vicuna13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a20ad832ca2a26f5efe5328c1f3e9a88fda8e55b5e26cad3817e1223ec889af
+size 35774420

db/2d/2d-what-attribute/instructblip_vicuna7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0632f15894b374a7e67a8bb6f53fdcf84d344cc310e097adc1f724364eaf2e7
+size 35774420

db/2d/2d-what-attribute/llava15_13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99c5a95c3e2de9bd0371060a6886f5bf1b4553120dccdcf38f790116e0712b76
+size 35774420

db/2d/2d-what-attribute/llava15_7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0ee4d8a941b1f3176e74bea7e158141c2102f4403acf6b7aa3723b7f2e462a4
+size 35774420

db/2d/2d-what-attribute/merged_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what-attribute/path.json ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what-attribute/qa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e5c3b625e90eea95aac69a176772422c1fb0b882cdee838a7e6c56fbe17b50a
+size 980447

db/2d/2d-what-attribute/qwenvl_chat_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac6c585bebeb3aaaed8dac2c7c4420d7d2e780a735f57b3e1e5b617f87ea8160
+size 35774420

db/2d/2d-what-attribute/qwenvl_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c3d7d5aec0726c1f51b7c65ce5c21e9beb9fda2598efe29e96c9788a8b60bdd
+size 35774420

db/2d/2d-what-attribute/task_plan.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e56dc9b74b81ec36a32811ddc3b6a85fbcf23992185f801d52fbce40516974cd
+size 783662

db/2d/2d-what/embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b089c1028a5d2f63609739d53e1c7630f66e4cba66d39a93bdf427565ef29104
+size 39137443

db/2d/2d-what/expanded_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what/gt.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7fffa0ff67eab9da2ea4dae3a005e1f9b75c10d12e6ca376dd5072297edf2d5
+size 612316

db/2d/2d-what/instructblip_vicuna13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e224cfb927268e4a43d289bc5b2025d1b2c767bfca14e284c32b1612a3e4d452
+size 35774420

db/2d/2d-what/instructblip_vicuna7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fef83cca3ed3fa7fce662b37e6085bb1f3373c91e4306eb2a79e79e807854f7
+size 35774420

db/2d/2d-what/llava15_13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cea6a0cc75170114f6f2725fe71c7fd353038b22d144a1afa2545579cdb5894
+size 35774420

db/2d/2d-what/llava15_7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ec12e11122cc629c502de7b8981a482692d20779704659feda4250f01bd5e9d
+size 35774420

db/2d/2d-what/merged_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what/path.json ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-what/qa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:907fb5a1fe2442e5e4c5837020633d970911f187043e0fc481cf9c06940ab643
+size 807018

db/2d/2d-what/qwenvl_chat_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad629744d9f58f163e74cf6e7126617c331681aae15700f5279df8949155b881
+size 35774420

db/2d/2d-what/qwenvl_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7bf48191df156d0618c524296077992fd75417fa2d7b03dbcc1628319d3989a5
+size 35774420

db/2d/2d-what/task_plan.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3113b3b2adbb2bb8088ca4ae86d654da511f783ddbf1ce98fff86cd864336225
+size 783652

db/2d/2d-where-attribute/embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8b38f46605548c2fc198a3b3d751a999aa28495a95a52e22a29b859f6a820e7
+size 39137443

db/2d/2d-where-attribute/expanded_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-where-attribute/gt.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22fe402eebadfb265f0186a2aee934da6d8144daa10a52173cf583ff4b0acec5
+size 612316

db/2d/2d-where-attribute/instructblip_vicuna13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48ad5886d1c0f90903c8857883b4b589f147f89e1aaa2bb68cff3e1d5707d8ce
+size 35774420

db/2d/2d-where-attribute/instructblip_vicuna7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:855e2037394af1adfe6d30906aeb038a2672d3d1ee8773c98844a63bc6aa66f1
+size 35774420

db/2d/2d-where-attribute/llava15_13b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a438d2b5617d58e08e9321b6a9c0907c08fdd2165103617331ebb3d288410d7
+size 35774420

db/2d/2d-where-attribute/llava15_7b_surprise.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93f420d01d0c87147c218eb76277c78d220acab084cf91443704cd5fb25daaf3
+size 35774420

db/2d/2d-where-attribute/merged_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-where-attribute/path.json ADDED Viewed

The diff for this file is too large to render. See raw diff

db/2d/2d-where-attribute/qa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e45587db39e2316974b4119d2825a27dc8284b46e1d7da643241c5c100ce879
+size 601802