Commit 48bb3eb: Initial commit
Parent(s): none

Files changed:
- .gitignore +13 -0
- .python-version +1 -0
- CLAUDE.md +49 -0
- README.md +45 -0
- app.py +150 -0
- populate_dataset.py +69 -0
- pyproject.toml +13 -0
- src/utils.py +64 -0
- uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+.gradio
+
.python-version ADDED
@@ -0,0 +1 @@
+3.13
CLAUDE.md ADDED
@@ -0,0 +1,49 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+This is a Python project called "amd-leaderboard" that tracks vLLM benchmark results on AMD hardware in a leaderboard format.
+
+## Development Environment
+
+- **Python Version**: 3.13 (specified in `.python-version`)
+- **Package Manager**: uv (modern Python package manager)
+- **Project Configuration**: `pyproject.toml`
+
+## Common Commands
+
+### Package Management
+```bash
+# Install dependencies
+uv sync
+
+# Add a new dependency
+uv add <package_name>
+
+# Add a development dependency
+uv add --dev <package_name>
+```
+
+### Running the Application
+```bash
+# Run the main application
+uv run python app.py
+
+# Run any Python file with the project's environment
+uv run python <file_path>
+```
+
+## Project Structure
+
+The codebase follows a simple Python package structure:
+- `app.py` - Gradio application entry point
+- `src/` - Source code directory
+  - `utils.py` - Contains a results data structure template
+
+## Key Information
+
+1. The project uses modern Python tooling with `uv` as the package manager and `pyproject.toml` for configuration
+2. Dependencies are declared in `pyproject.toml` and locked in `uv.lock`
+3. The `src/utils.py` file contains a template structure (`RESULT_SCHEMA`) that pairs run configurations with performance and accuracy metric values in a dictionary format
README.md ADDED
@@ -0,0 +1,45 @@
+---
+title: AMD vLLM Benchmark Leaderboard
+emoji: 🏆
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 5.35.0
+app_file: app.py
+pinned: false
+---
+
+# AMD vLLM Benchmark Leaderboard
+
+A leaderboard for tracking performance and accuracy metrics of vLLM benchmarks on AMD hardware.
+
+## Features
+
+- **Performance Metrics Tracking**: TTFT, TPOT, ITL, E2E Latency, and Throughput
+- **Accuracy Metrics**: WikiText perplexity scores
+- **Manual Result Submission**: Enter benchmark results through the web interface
+- **JSON Upload**: Direct upload of benchmark result files from vLLM
+- **Sortable Leaderboard**: Compare results across different runs and configurations
+
+## Metrics
+
+- **TTFT (Time To First Token)**: Time to generate the first token (ms)
+- **TPOT (Time Per Output Token)**: Average generation time per output token (ms)
+- **ITL (Inter-Token Latency)**: Latency between consecutive tokens (ms)
+- **E2E Latency**: End-to-end request latency (ms)
+- **Throughput**: Total tokens processed per second
+- **WikiText Perplexity**: Model accuracy metric (lower is better)
+
+## Usage
+
+1. **View Results**: Check the Leaderboard tab to see all benchmark results
+2. **Submit Results**: Use the Submit Results tab to add new benchmark data
+   - Manual entry for individual metrics
+   - JSON upload for direct benchmark output files
+3. **Compare**: Sort and analyze results to identify the best configurations
+
+## Integration with vLLM Benchmarks
+
+This leaderboard is designed to work with output from:
+- `vllm/benchmarks/benchmark_serving.py` for performance metrics
+- `lm-evaluation-harness` for accuracy metrics
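For the JSON upload path, the fields the leaderboard needs can be pulled straight out of a saved benchmark result. A minimal sketch, assuming a `benchmark_serving.py` run saved its result as JSON and that the median-metric key names match the template in `src/utils.py` (the file name is illustrative; verify the keys against your vLLM version):

```python
# Sketch: extract leaderboard metrics from a saved vLLM benchmark result.
# Key names follow the template in src/utils.py; they are assumptions here.
import json

with open("benchmark_result.json") as f:  # hypothetical output file
    result = json.load(f)

metrics = {
    "ttft": result["median_ttft_ms"],                # Time To First Token (ms)
    "tpot": result["median_tpot_ms"],                # Time Per Output Token (ms)
    "itl": result["median_itl_ms"],                  # Inter-Token Latency (ms)
    "e2e": result["median_e2el_ms"],                 # End-to-End Latency (ms)
    "throughput": result["total_token_throughput"],  # tokens/s
}
print(metrics)
```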
app.py ADDED
@@ -0,0 +1,150 @@
+import gradio as gr
+import pandas as pd
+from datetime import datetime
+import os
+from datasets import load_dataset, Dataset
+
+# Configuration
+DATASET_ID = "siro1/amd-hackathon"
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Optional: for write access
+
+dataset = None
+
+
+def load_results():
+    """Load results from the Hugging Face dataset as a sorted DataFrame."""
+    global dataset
+    dataset = load_dataset(DATASET_ID, split="train")
+    # Rename dataset columns to the headers displayed on the leaderboard
+    results = dataset.map(
+        lambda item: {
+            "Team": item["team"],
+            "Timestamp": item["timestamp"],
+            "TTFT (ms)": item["ttft"],
+            "TPOT (ms)": item["tpot"],
+            "ITL (ms)": item["itl"],
+            "E2E Latency (ms)": item["e2e"],
+            "Throughput (tokens/s)": item["throughput"],
+            "Bits per Byte": item["bits_per_byte"],
+            "Byte Perplexity": item["byte_perplexity"],
+            "Word Perplexity": item["word_perplexity"],
+        },
+        batch_size=64,
+        remove_columns=dataset.column_names,
+    )
+    df = results.to_pandas()
+    df = df.sort_values("Throughput (tokens/s)", ascending=False)
+    return df
+
+
+def update_dataset(
+    team_name,
+    ttft,
+    tpot,
+    itl,
+    e2e,
+    throughput,
+    bits_per_byte,
+    byte_perplexity,
+    word_perplexity,
+):
+    """Insert a new row into the Hugging Face dataset."""
+    existing_data = dataset.to_list()
+
+    new_entry = {
+        "team": team_name,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "ttft": float(ttft),
+        "tpot": float(tpot),
+        "itl": float(itl),
+        "e2e": float(e2e),
+        "throughput": float(throughput),
+        "bits_per_byte": float(bits_per_byte),
+        "byte_perplexity": float(byte_perplexity),
+        "word_perplexity": float(word_perplexity),
+    }
+    existing_data.append(new_entry)
+    updated_dataset = Dataset.from_list(existing_data)
+    updated_dataset.push_to_hub(DATASET_ID, token=HF_TOKEN)
+
+    return True
+
+
+def api_submit_results(
+    team_name: str,
+    ttft: float,
+    tpot: float,
+    itl: float,
+    e2e: float,
+    throughput: float,
+    bits_per_byte: float,
+    byte_perplexity: float,
+    word_perplexity: float,
+) -> str:
+    try:
+        # Update the dataset with the new submission
+        success = update_dataset(
+            team_name=team_name,
+            ttft=ttft,
+            tpot=tpot,
+            itl=itl,
+            e2e=e2e,
+            throughput=throughput,
+            bits_per_byte=bits_per_byte,
+            byte_perplexity=byte_perplexity,
+            word_perplexity=word_perplexity,
+        )
+
+        if success:
+            return f"Your submission for {team_name} has been accepted 🤗"
+        else:
+            return f"Failed to submit results for {team_name} 😢"
+
+    except Exception as e:
+        return f"Failed to submit results for {team_name} 😢: {str(e)}"
+
+
+# Create Gradio interface
+def create_interface():
+    with gr.Blocks(title="AMD vLLM Benchmark Leaderboard") as demo:
+        gr.Markdown("# AMD vLLM Benchmark Leaderboard")
+        gr.Markdown(
+            "Track and compare performance and accuracy metrics for vLLM benchmarks"
+        )
+
+        with gr.Tab("Leaderboard"):
+            # Function to refresh the leaderboard from the Hub
+            def refresh_leaderboard():
+                return load_results()
+
+            # Initial load
+            leaderboard_table = gr.DataFrame(
+                value=refresh_leaderboard(),
+                label="Benchmark Results",
+                interactive=False,
+            )
+
+            refresh_btn = gr.Button("Refresh Leaderboard")
+            refresh_btn.click(
+                fn=refresh_leaderboard,
+                outputs=leaderboard_table,
+            )
+
+        # Expose the submission function as a named API endpoint
+        gr.api(
+            fn=api_submit_results,
+            api_name="submit_results",
+        )
+
+    return demo
+
+
+# Create and launch the app
+if __name__ == "__main__":
+    demo = create_interface()
+
+    demo.queue()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+    )
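Because `app.py` registers `api_submit_results` through `gr.api` under the name `submit_results`, teams can submit results programmatically rather than through the UI. A minimal sketch using `gradio_client`; the Space id and all metric values are placeholders:

```python
# Sketch: submit a benchmark result to the leaderboard's API endpoint.
# Assumes the Space is running and gradio_client is installed.
from gradio_client import Client

client = Client("siro1/amd-leaderboard")  # hypothetical Space id
message = client.predict(
    team_name="Team Alpha",
    ttft=42.1,
    tpot=11.2,
    itl=10.9,
    e2e=1456.8,
    throughput=2234.9,
    bits_per_byte=0.54,
    byte_perplexity=1.45,
    word_perplexity=4.13,
    api_name="/submit_results",
)
print(message)  # e.g. "Your submission for Team Alpha has been accepted 🤗"
```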
populate_dataset.py ADDED
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+Script to populate the Hugging Face dataset with mock data
+"""
+
+from datasets import Dataset
+from datetime import datetime, timedelta
+import random
+
+# Configuration
+DATASET_ID = "siro1/amd-hackathon"
+HF_TOKEN = None  # Set this if needed for private repos
+
+# Generate mock data
+mock_data = []
+
+teams = ["Team Alpha", "Team Beta", "Team Gamma", "Team Delta", "Team Epsilon"]
+base_date = datetime.now() - timedelta(days=7)
+
+for i in range(10):
+    team = random.choice(teams)
+    timestamp = (base_date + timedelta(days=i / 2)).strftime("%Y-%m-%d %H:%M:%S")
+
+    # Vary configurations
+    input_length = random.choice([128, 256, 512])
+    output_length = random.choice([128, 256, 512])
+    concurrent_requests = random.choice([8, 16, 32, 64])
+
+    # Generate performance metrics with some variance
+    base_ttft = 40 + random.uniform(-10, 10)
+    base_tpot = 11 + random.uniform(-2, 2)
+    base_itl = 10 + random.uniform(-2, 2)
+    base_e2e = 1500 + (input_length + output_length) * 2 + random.uniform(-200, 200)
+    base_throughput = 2000 + concurrent_requests * 20 + random.uniform(-200, 200)
+    bits_per_byte = 0.54 + random.uniform(-0.02, 0.02)
+    byte_perplexity = 1.45 + random.uniform(-0.02, 0.02)
+    word_perplexity = 4.13 + random.uniform(-0.02, 0.02)
+
+    entry = {
+        "team": team,
+        "timestamp": timestamp,
+        "ttft": round(base_ttft, 2),
+        "tpot": round(base_tpot, 2),
+        "itl": round(base_itl, 2),
+        "e2e": round(base_e2e, 2),
+        "throughput": round(base_throughput, 2),
+        "bits_per_byte": round(bits_per_byte, 2),
+        "byte_perplexity": round(byte_perplexity, 2),
+        "word_perplexity": round(word_perplexity, 2),
+    }
+
+    mock_data.append(entry)
+
+# Sort by timestamp
+mock_data.sort(key=lambda x: x["timestamp"])
+
+# Create dataset and push to hub
+print(f"Creating dataset with {len(mock_data)} entries...")
+dataset = Dataset.from_list(mock_data)
+
+print(f"Pushing to Hugging Face Hub: {DATASET_ID}")
+dataset.push_to_hub(DATASET_ID, token=HF_TOKEN)
+
+print("Dataset populated successfully!")
+print("\nSample entries:")
+for entry in mock_data[:3]:
+    print(
+        f"- {entry['team']} at {entry['timestamp']}: throughput={entry['throughput']}"
+    )
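After running the script, a quick way to confirm the rows landed on the Hub is to reload the dataset. A small sketch (read access only; no token needed while the dataset is public):

```python
# Sketch: verify the pushed dataset by reloading it from the Hub.
from datasets import load_dataset

ds = load_dataset("siro1/amd-hackathon", split="train")
print(ds.column_names)  # expected: team, timestamp, ttft, tpot, itl, e2e, ...
print(f"{len(ds)} rows")
```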
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+[project]
+name = "amd-leaderboard"
+version = "0.1.0"
+description = "A leaderboard for tracking vLLM benchmark results on AMD hardware"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "datasets>=3.6.0",
+    "gradio>=5.35.0",
+    "huggingface-hub>=0.33.1",
+    "ipython>=9.3.0",
+    "pandas>=2.3.0",
+]
src/utils.py ADDED
@@ -0,0 +1,64 @@
+# Example result structure for vLLM benchmarks
+RESULT_SCHEMA = {
+    "run_id": {
+        "model": "model_name",
+        "timestamp": "YYYY-MM-DD HH:MM:SS",
+        "config": {
+            "input_length": 128,
+            "output_length": 128,
+            "concurrent_requests": 16
+        },
+        "performance": {
+            "median_ttft_ms": 0.0,  # Time To First Token
+            "median_tpot_ms": 0.0,  # Time Per Output Token
+            "median_itl_ms": 0.0,  # Inter-Token Latency
+            "median_e2el_ms": 0.0,  # End-to-End Latency
+            "total_token_throughput": 0.0
+        },
+        "accuracy": {
+            "wikitext_perplexity": 0.0
+        }
+    }
+}
+
+# Mock data for testing
+MOCK_RESULTS = {
+    "run_2024_12_01_baseline": {
+        "model": "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV",
+        "timestamp": "2024-12-01 10:30:00",
+        "config": {
+            "input_length": 128,
+            "output_length": 128,
+            "concurrent_requests": 16
+        },
+        "performance": {
+            "median_ttft_ms": 45.23,
+            "median_tpot_ms": 12.56,
+            "median_itl_ms": 11.89,
+            "median_e2el_ms": 1589.45,
+            "total_token_throughput": 2048.67
+        },
+        "accuracy": {
+            "wikitext_perplexity": 7.89
+        }
+    },
+    "run_2024_12_02_optimized": {
+        "model": "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV",
+        "timestamp": "2024-12-02 14:15:00",
+        "config": {
+            "input_length": 128,
+            "output_length": 128,
+            "concurrent_requests": 16
+        },
+        "performance": {
+            "median_ttft_ms": 42.11,
+            "median_tpot_ms": 11.23,
+            "median_itl_ms": 10.95,
+            "median_e2el_ms": 1456.78,
+            "total_token_throughput": 2234.89
+        },
+        "accuracy": {
+            "wikitext_perplexity": 7.91
+        }
+    }
+}
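The nested `RESULT_SCHEMA` shape differs from the flat rows that `app.py` stores on the Hub, so a run in this format would need flattening before submission. A sketch of one plausible mapping; note the accuracy fields do not line up one-to-one (the schema tracks `wikitext_perplexity` while the dataset stores `word_perplexity`, `byte_perplexity`, and `bits_per_byte`), so the perplexity mapping and team name below are assumptions:

```python
# Sketch: flatten a RESULT_SCHEMA-style run into a leaderboard dataset row.
from src.utils import MOCK_RESULTS


def flatten_run(run: dict, team: str) -> dict:
    perf = run["performance"]
    return {
        "team": team,  # assumed: team name is not part of RESULT_SCHEMA
        "timestamp": run["timestamp"],
        "ttft": perf["median_ttft_ms"],
        "tpot": perf["median_tpot_ms"],
        "itl": perf["median_itl_ms"],
        "e2e": perf["median_e2el_ms"],
        "throughput": perf["total_token_throughput"],
        # Assumed mapping: treat WikiText perplexity as word perplexity.
        "word_perplexity": run["accuracy"]["wikitext_perplexity"],
    }


row = flatten_run(MOCK_RESULTS["run_2024_12_01_baseline"], team="Team Alpha")
print(row)
```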
uv.lock ADDED
The diff for this file is too large to render.