Spaces:

CultriX
/

Tiny-LeaderBoard

Sleeping

App Files Files Community

CultriX committed on Dec 23, 2024

Commit

90cb3d2

verified ·

1 Parent(s): 7b7eb30

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -200

app.py CHANGED Viewed

@@ -1,208 +1,125 @@
-import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
-import seaborn as sns
-import plotly.graph_objs as go
-import plotly.io as pio
-from io import StringIO
-import base64
-# Read the data from the file
-def parse_data(file_content):
-    lines = file_content.splitlines()
-    model_data = []
-    current_model = None
-    for line in lines:
-        line = line.strip()
-        if line.startswith('hf (pretrained='):
-            current_model = line.split('pretrained=')[1].split(',')[0]
-        elif line and current_model:
-            if '|' in line:
-                # Parse table row
-                parts = [p.strip() for p in line.split('|')]
-                if len(parts) >= 2:  # Ensure the correct number of columns
-                    try:
-                        task_name = parts[0]
-                        value = float(parts[1])  # Extract the numeric value
-                        model_data.append([
-                            current_model,
-                            task_name,  # Task name
-                            value
-                        ])
-                    except ValueError:
-                        print(f"Skipping row due to invalid value: {parts}")
-    if not model_data:
-        print("No valid data found in the file.")
-    return pd.DataFrame(model_data, columns=['Model', 'Task', 'Value'])
-# Calculate average performance
-def calculate_averages(data):
-    if data.empty:
-        print("No data available to calculate averages.")
-        return pd.DataFrame(columns=['Model', 'Average Performance'])
-    return data.groupby('Model')['Value'].mean().reset_index().rename(columns={'Value': 'Average Performance'})
-def create_bar_chart(df, category):
-    """Create a horizontal bar chart for the specified category."""
-    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
-    fig = go.Figure(go.Bar(
-        x=sorted_df[category],
-        y=sorted_df['Model'],
-        orientation='h',
-        marker=dict(color=sorted_df[category], colorscale='Viridis'),
-        hoverinfo='x+y',
-        text=sorted_df[category],
-        textposition='auto'
-    ))
-    fig.update_layout(
-        margin=dict(l=20, r=20, t=20, b=20),
-        title=f"Leaderboard for {category} Scores"
-    )
-    return fig
-def generate_visualizations(data, averages):
-    sns.set(style='whitegrid')
-    if averages.empty:
-        print("No averages to visualize.")
-        return None, None, None, None, None, None
-    averages = averages.sort_values(by='Average Performance')
-    # Matplotlib average performance plot
     plt.figure(figsize=(12, 8))
-    sns.barplot(data=averages, x='Average Performance', y='Model', palette='viridis')
-    plt.title('Average Performance of Models', fontsize=16)
-    plt.xlabel('Average Performance', fontsize=12)
-    plt.ylabel('Model', fontsize=12)
     plt.tight_layout()
-    # Save the plot to a buffer
-    buffer_avg = StringIO()
-    plt.savefig(buffer_avg, format='png')
-    buffer_avg.seek(0)
-    image_avg = base64.b64encode(buffer_avg.read()).decode('utf-8')
-    plt.close()
-    # Line plot for task performance by model
-    sorted_models = averages['Model'].tolist()
-    data['Model'] = pd.Categorical(data['Model'], categories=sorted_models, ordered=True)
-    data = data.sort_values(by=['Model', 'Task'])
-    if data.empty:
-        print("No data available for line plot.")
-        return image_avg, None, None, None, None, None
     plt.figure(figsize=(14, 10))
-    sns.lineplot(data=data, x='Task', y='Value', hue='Model', marker='o')
-    plt.title('Task Performance by Model', fontsize=16)
-    plt.xlabel('Task', fontsize=12)
-    plt.ylabel('Performance', fontsize=12)
-    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')
     plt.xticks(rotation=45)
     plt.tight_layout()
-    # Save the line plot to a buffer
-    buffer_line = StringIO()
-    plt.savefig(buffer_line, format='png')
-    buffer_line.seek(0)
-    image_line = base64.b64encode(buffer_line.read()).decode('utf-8')
-    plt.close()
-    # Heatmap of task performance
-    pivot_table = data.pivot_table(index='Task', columns='Model', values='Value')
-    plt.figure(figsize=(12, 10))
-    sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
-    plt.title('Task Performance Heatmap', fontsize=16)
-    plt.xlabel('Model', fontsize=12)
-    plt.ylabel('Task', fontsize=12)
-    plt.tight_layout()
-    # Save the heatmap to a buffer
-    buffer_heatmap = StringIO()
-    plt.savefig(buffer_heatmap, format='png')
-    buffer_heatmap.seek(0)
-    image_heatmap = base64.b64encode(buffer_heatmap.read()).decode('utf-8')
-    plt.close()
-    # Boxplot of performance distribution per model
-    plt.figure(figsize=(12, 8))
-    sns.boxplot(data=data, x='Model', y='Value', palette='Set2')
-    plt.title('Performance Distribution per Model', fontsize=16)
-    plt.xlabel('Model', fontsize=12)
-    plt.ylabel('Performance', fontsize=12)
-    plt.xticks(rotation=45)
     plt.tight_layout()
-    # Save the boxplot to a buffer
-    buffer_boxplot = StringIO()
-    plt.savefig(buffer_boxplot, format='png')
-    buffer_boxplot.seek(0)
-    image_boxplot = base64.b64encode(buffer_boxplot.read()).decode('utf-8')
-    plt.close()
-    # Create plotly bar charts
-    fig1 = create_bar_chart(averages, 'Average Performance')
-    plotly_avg = pio.to_html(fig1, full_html=False)
-    plotly_tasks = {}
-    # Assuming you have tasks in the dataframe and want to display it
-    tasks = data['Task'].unique()
-    for task in tasks:
-        task_data = data[data['Task'] == task]
-        fig2 = create_bar_chart(task_data, 'Value')
-        fig2.update_layout(title=f"Leaderboard for {task} Scores")
-        plotly_tasks[task] = pio.to_html(fig2, full_html=False)
-    return image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
-def process_and_visualize(file_content):
-    data = parse_data(file_content)
-    averages = calculate_averages(data)
-    image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks = generate_visualizations(data, averages)
-    output_text = f"Average Performance per Model:\n{averages.sort_values(by='Average Performance').to_string()}"
-    return output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
-if __name__ == "__main__":
-    task_names = ['tinyArc', 'tinyHellaswag', 'tinyMMLU', 'tinyTruthfulQA', 'tinyTruthfulQA_mc1', 'tinyWinogrande']
-    with gr.Blocks(title="LLM Benchmark Visualizer") as demo:
-        gr.Markdown("Upload your LLM benchmark data and visualize the results.")
-        with gr.Row():
-           input_text = gr.Textbox(lines=10, label="Paste your data here")
-        with gr.Row():
-             output_text = gr.Textbox(label="Average Performance per Model")
-        with gr.Row():
-            with gr.Column():
-                image_avg = gr.Image(label="Matplotlib Average Performance Chart")
-                image_line = gr.Image(label="Matplotlib Task Performance Line Chart")
-            with gr.Column():
-                 image_heatmap = gr.Image(label="Matplotlib Task Performance Heatmap")
-                 image_boxplot = gr.Image(label="Matplotlib Performance Distribution Boxplot")
-        with gr.Row():
-              plotly_avg = gr.HTML(label="Plotly Average Performance Chart")
-        task_tabs = gr.TabbedInterface([])
-        def update_tabs(file_content):
-            _, _, _, _, _, _, plotly_tasks = process_and_visualize(file_content)
-            return [gr.HTML(value=html, label=task) for task, html in plotly_tasks.items()]
-        input_text.change(
-            fn=process_and_visualize,
-            inputs=input_text,
-            outputs=[output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg],
-        )
-        input_text.change(fn=update_tabs, inputs=input_text, outputs=[task_tabs])
-    demo.launch(share=True)

 import pandas as pd
 import matplotlib.pyplot as plt
+import gradio as gr
+# Input data
+data_full = [
+    ["CultriX/Qwen2.5-14B-SLERPv7", 0.7205, 0.8272, 0.7541, 0.6581, 0.5000, 0.7290],
+    ["djuna/Q2.5-Veltha-14B-0.5", 0.7492, 0.8386, 0.7305, 0.5980, 0.4300, 0.7817],
+    ["CultriX/Qwen2.5-14B-FinalMerge", 0.7248, 0.8277, 0.7113, 0.7052, 0.5700, 0.7001],
+    ["CultriX/Qwen2.5-14B-MultiCultyv2", 0.7295, 0.8359, 0.7363, 0.5767, 0.4400, 0.7316],
+    ["CultriX/Qwen2.5-14B-Brocav7", 0.7445, 0.8353, 0.7508, 0.6292, 0.4600, 0.7629],
+    ["CultriX/Qwen2.5-14B-Broca", 0.7456, 0.8352, 0.7480, 0.6034, 0.4400, 0.7716],
+    ["CultriX/Qwen2.5-14B-Brocav3", 0.7395, 0.8388, 0.7393, 0.6405, 0.4700, 0.7659],
+    ["CultriX/Qwen2.5-14B-Brocav4", 0.7432, 0.8377, 0.7444, 0.6277, 0.4800, 0.7580],
+    ["CultriX/Qwen2.5-14B-Brocav2", 0.7492, 0.8302, 0.7508, 0.6377, 0.5100, 0.7478],
+    ["CultriX/Qwen2.5-14B-Brocav5", 0.7445, 0.8313, 0.7547, 0.6376, 0.5000, 0.7304],
+    ["CultriX/Qwen2.5-14B-Brocav6", 0.7179, 0.8354, 0.7531, 0.6378, 0.4900, 0.7524],
+    ["CultriX/Qwenfinity-2.5-14B", 0.7347, 0.8254, 0.7279, 0.7267, 0.5600, 0.6970],
+    ["CultriX/Qwen2.5-14B-Emergedv2", 0.7137, 0.8335, 0.7363, 0.5836, 0.4400, 0.7344],
+    ["CultriX/Qwen2.5-14B-Unity", 0.7063, 0.8343, 0.7423, 0.6820, 0.5700, 0.7498],
+    ["CultriX/Qwen2.5-14B-MultiCultyv3", 0.7132, 0.8216, 0.7395, 0.6792, 0.5500, 0.7120],
+    ["CultriX/Qwen2.5-14B-Emergedv3", 0.7436, 0.8312, 0.7519, 0.6585, 0.5500, 0.7068],
+    ["CultriX/SeQwence-14Bv1", 0.7278, 0.8410, 0.7541, 0.6816, 0.5200, 0.7539],
+    ["CultriX/Qwen2.5-14B-Wernickev2", 0.7391, 0.8168, 0.7273, 0.6220, 0.4500, 0.7572],
+    ["CultriX/Qwen2.5-14B-Wernickev3", 0.7357, 0.8148, 0.7245, 0.7023, 0.5500, 0.7869],
+    ["CultriX/Qwen2.5-14B-Wernickev4", 0.7355, 0.8290, 0.7497, 0.6306, 0.4800, 0.7635],
+    ["CultriX/SeQwential-14B-v1", 0.7355, 0.8205, 0.7549, 0.6367, 0.4800, 0.7626],
+    ["CultriX/Qwen2.5-14B-Wernickev5", 0.7224, 0.8272, 0.7541, 0.6790, 0.5100, 0.7578],
+    ["CultriX/Qwen2.5-14B-Wernickev6", 0.6994, 0.7549, 0.5816, 0.6991, 0.5800, 0.7267],
+    ["CultriX/Qwen2.5-14B-Wernickev7", 0.7147, 0.7599, 0.6097, 0.7056, 0.5700, 0.7164],
+    ["CultriX/Qwen2.5-14B-FinalMerge-tmp2", 0.7255, 0.8192, 0.7535, 0.6671, 0.5000, 0.7612],
+]
+columns = ["Model Configuration", "tinyArc", "tinyHellaswag", "tinyMMLU", "tinyTruthfulQA", "tinyTruthfulQA_mc1", "tinyWinogrande"]
+# Convert to DataFrame
+df_full = pd.DataFrame(data_full, columns=columns)
def plot_average_scores():
    """Render a horizontal bar chart of each model's mean score across tasks.

    Side effect: writes the per-model mean into ``df_full["Average Score"]``.
    That module-level mutation is preserved from the original behaviour.

    Returns:
        str: Path of the saved PNG, ``"average_performance.png"``.
    """
    # Average only the benchmark columns. On repeat calls the previously
    # stored "Average Score" column must not be fed back into the mean.
    task_columns = [
        col for col in df_full.columns
        if col not in ("Model Configuration", "Average Score")
    ]
    df_full["Average Score"] = df_full[task_columns].mean(axis=1)
    df_avg_sorted = df_full.sort_values(by="Average Score", ascending=False)

    fig = plt.figure(figsize=(12, 8))
    try:
        plt.barh(df_avg_sorted["Model Configuration"], df_avg_sorted["Average Score"])
        plt.title("Average Performance of Models Across Tasks", fontsize=16)
        plt.xlabel("Average Score", fontsize=14)
        plt.ylabel("Model Configuration", fontsize=14)
        # barh draws the first row at the bottom; flip so the best model is on top.
        plt.gca().invert_yaxis()
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig("average_performance.png")
    finally:
        # pyplot keeps every figure alive until it is closed; without this each
        # button click would leak one figure in the long-running app process.
        plt.close(fig)
    return "average_performance.png"
def plot_task_performance():
    """Plot one line per model showing its score on every benchmark task.

    Only the benchmark columns are plotted. A derived "Average Score" column,
    if present on ``df_full``, is left out so the chart does not change
    depending on which other plots were generated first.

    Returns:
        str: Path of the saved PNG, ``"task_performance.png"``.
    """
    task_columns = [
        col for col in df_full.columns
        if col not in ("Model Configuration", "Average Score")
    ]
    df_full_melted = df_full.melt(
        id_vars="Model Configuration",
        value_vars=task_columns,
        var_name="Task",
        value_name="Score",
    )

    fig = plt.figure(figsize=(14, 10))
    try:
        # sort=False keeps models in their original row order for the legend.
        for model, model_data in df_full_melted.groupby("Model Configuration", sort=False):
            plt.plot(model_data["Task"], model_data["Score"], marker="o", label=model)
        plt.title("Performance of All Models Across Tasks", fontsize=16)
        plt.xlabel("Task", fontsize=14)
        plt.ylabel("Score", fontsize=14)
        plt.xticks(rotation=45)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig("task_performance.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "task_performance.png"
def plot_task_specific_top_models():
    """Plot the best score achieved on each task, labelled with the winning model.

    Task columns are selected by name rather than by position, so every
    benchmark is included whether or not an "Average Score" column has been
    added to ``df_full``.

    Returns:
        str: Path of the saved PNG, ``"task_specific_top_models.png"``.
    """
    task_columns = [
        col for col in df_full.columns
        if col not in ("Model Configuration", "Average Score")
    ]
    scores = df_full.set_index("Model Configuration")[task_columns]
    results = (
        pd.DataFrame({"Top Model": scores.idxmax(), "Score": scores.max()})
        .rename_axis("Task")
        .reset_index()
    )

    fig = plt.figure(figsize=(12, 6))
    try:
        bars = plt.bar(results["Task"], results["Score"])
        # Label each bar with the model that achieved the score; the bar
        # heights alone do not say which model won.
        for bar, model in zip(bars, results["Top Model"]):
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height(),
                model,
                ha="center",
                va="bottom",
                fontsize=8,
            )
        plt.title("Task-Specific Top Models", fontsize=16)
        plt.xlabel("Task", fontsize=14)
        plt.ylabel("Score", fontsize=14)
        plt.grid(axis="y", linestyle="--", alpha=0.7)
        plt.tight_layout()
        plt.savefig("task_specific_top_models.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "task_specific_top_models.png"
def top_3_models_per_task(df=None):
    """Return the three highest-scoring models for every benchmark task.

    Args:
        df: Optional score table with a "Model Configuration" column and one
            numeric column per task. Defaults to the module-level ``df_full``.

    Returns:
        pandas.DataFrame: One row per task with columns ``"Task"``,
        ``"Top 3 Models"`` (list of model names, best first) and ``"Scores"``
        (the matching list of scores).
    """
    if df is None:
        df = df_full

    # Pick task columns by name. Slicing by position would drop the last real
    # task whenever the derived "Average Score" column has not been added yet.
    task_columns = [
        col for col in df.columns
        if col not in ("Model Configuration", "Average Score")
    ]

    rows = []
    for task in task_columns:
        best = df.nlargest(3, task)
        rows.append({
            "Task": task,
            "Top 3 Models": best["Model Configuration"].tolist(),
            "Scores": best[task].tolist(),
        })
    return pd.DataFrame(rows, columns=["Task", "Top 3 Models", "Scores"])
# Gradio UI: one row per analysis, each pairing a trigger button with the
# component that displays the result.
with gr.Blocks() as demo:
    gr.Markdown("# Model Performance Analysis")

    # The three chart rows are identical apart from label and handler.
    chart_rows = (
        ("Show Average Performance", plot_average_scores),
        ("Show Task Performance", plot_task_performance),
        ("Task-Specific Top Models", plot_task_specific_top_models),
    )
    for button_label, chart_handler in chart_rows:
        with gr.Row():
            chart_button = gr.Button(button_label)
            chart_image = gr.Image(type="filepath")
            chart_button.click(chart_handler, outputs=chart_image)

    # The final row shows a table rather than an image.
    with gr.Row():
        table_button = gr.Button("Top 3 Models Per Task")
        table_output = gr.Dataframe()
        table_button.click(top_3_models_per_task, outputs=table_output)

demo.launch()