Spaces:

hysts
/

kyutai-stt-2.6b-en

Running on Zero

File size: 4,156 Bytes

54ed3c0
 
 
 
 
 
 
 
 
 
 
 
28519bb
 
54ed3c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d020ce8
 
1e9d7e6
54ed3c0
 
 
 
 
 
 
 
 
 
 
 
d020ce8
 
 
 
 
 
 
 
1e9d7e6
d020ce8
 
 
 
 
93d97ba
1e9d7e6
d020ce8
 
 
 
f391bbe
 
 
 
d020ce8
1e9d7e6
d020ce8
f391bbe
 
 
 
 
 
1e9d7e6
d020ce8
 
 
 
 
 
1e9d7e6
d020ce8
1e9d7e6
d020ce8
 
 
 
 
 
 
 
 
 
1e9d7e6
d020ce8
 
 
 
f15c490
d020ce8
 
 
 
1e9d7e6
d020ce8
35f4b88
 
 
 
 
 
1e9d7e6
54ed3c0
 
6b12f2c

#!/usr/bin/env python

import os
import pathlib

import gradio as gr
import librosa
import spaces
import torch
from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en-trfs"
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device, torch_dtype="auto")
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)

SAMPLE_RATE = 24000
MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION


@spaces.GPU
def transcribe(audio_path: str) -> str:
    """Transcribe an English audio file to text.

    Args:
        audio_path (str): The path to the audio file. The audio must contain English speech.

    Returns:
        str: The transcription of the English audio file.
    """
    if not audio_path:
        return ""

    data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
    if len(data) > MAX_SAMPLE_SIZE:
        data = data[:MAX_SAMPLE_SIZE]
        gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")

    inputs = processor(data)
    inputs.to(device)
    output_tokens = model.generate(**inputs)
    output = processor.batch_decode(output_tokens, skip_special_tokens=True)
    return output[0]


with gr.Blocks(fill_height=False) as demo:
    # Header
    gr.HTML("""
        <div class="header-container">
            <h1 class="header-title">🎙️ Kyutai Speech-to-Text</h1>
            <p class="header-subtitle">Advanced English Audio Transcription powered by AI</p>
        </div>
    """)

    # Info banner
    gr.HTML(f"""
        <div class="info-banner">
            ℹ️ Upload or record audio in English (max {MAX_DURATION} seconds). Supports WAV, MP3, and other common formats.
        </div>
    """)  # noqa: RUF001

    # Main content
    with gr.Group(elem_classes="main-card"):
        # Audio input
        audio = gr.Audio(
            label="🎵 Audio Input",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-container",
        )

        # Transcribe button
        transcribe_btn = gr.Button(
            "✨ Transcribe Audio",
            variant="primary",
            size="lg",
            elem_classes="primary-button",
        )

        # Output
        output = gr.Textbox(
            label="📝 Transcription",
            placeholder="Your transcription will appear here...",
            lines=6,
            max_lines=12,
            elem_classes="transcription-output",
        )

    # Examples section
    with gr.Group(elem_classes="examples-container"):
        gr.Markdown("### 💡 Try These Examples")
        gr.Examples(
            examples=sorted(pathlib.Path("assets").glob("*.wav")) if pathlib.Path("assets").exists() else [],
            inputs=audio,
            outputs=output,
            fn=transcribe,
            examples_per_page=5,
        )

    # Footer
    gr.HTML("""
        <div class="footer-container">
            <p>
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="footer-link" target="_blank">anycoder</a> •
                Powered by <a href="https://huggingface.co/kyutai/stt-2.6b-en-trfs" class="footer-link" target="_blank">Kyutai STT 2.6B</a>
            </p>
        </div>
    """)

    # Event handlers
    transcribe_btn.click(
        fn=transcribe,
        inputs=audio,
        outputs=output,
        api_name="transcribe",
    )


if __name__ == "__main__":
    # Custom theme for modern, clean design
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="md",
        radius_size="lg",
    ).set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
        block_label_text_weight="500",
    )
    demo.launch(theme=theme, css_paths="style.css", mcp_server=True)