hysts's picture
hysts HF Staff
Update
6b12f2c
#!/usr/bin/env python
import os
import pathlib
import gradio as gr
import librosa
import spaces
import torch
from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en-trfs"
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device, torch_dtype="auto")
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
SAMPLE_RATE = 24000
MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION
@spaces.GPU
def transcribe(audio_path: str) -> str:
"""Transcribe an English audio file to text.
Args:
audio_path (str): The path to the audio file. The audio must contain English speech.
Returns:
str: The transcription of the English audio file.
"""
if not audio_path:
return ""
data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
if len(data) > MAX_SAMPLE_SIZE:
data = data[:MAX_SAMPLE_SIZE]
gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")
inputs = processor(data)
inputs.to(device)
output_tokens = model.generate(**inputs)
output = processor.batch_decode(output_tokens, skip_special_tokens=True)
return output[0]
with gr.Blocks(fill_height=False) as demo:
# Header
gr.HTML("""
<div class="header-container">
<h1 class="header-title">πŸŽ™οΈ Kyutai Speech-to-Text</h1>
<p class="header-subtitle">Advanced English Audio Transcription powered by AI</p>
</div>
""")
# Info banner
gr.HTML(f"""
<div class="info-banner">
ℹ️ Upload or record audio in English (max {MAX_DURATION} seconds). Supports WAV, MP3, and other common formats.
</div>
""") # noqa: RUF001
# Main content
with gr.Group(elem_classes="main-card"):
# Audio input
audio = gr.Audio(
label="🎡 Audio Input",
type="filepath",
sources=["upload", "microphone"],
elem_classes="audio-container",
)
# Transcribe button
transcribe_btn = gr.Button(
"✨ Transcribe Audio",
variant="primary",
size="lg",
elem_classes="primary-button",
)
# Output
output = gr.Textbox(
label="πŸ“ Transcription",
placeholder="Your transcription will appear here...",
lines=6,
max_lines=12,
elem_classes="transcription-output",
)
# Examples section
with gr.Group(elem_classes="examples-container"):
gr.Markdown("### πŸ’‘ Try These Examples")
gr.Examples(
examples=sorted(pathlib.Path("assets").glob("*.wav")) if pathlib.Path("assets").exists() else [],
inputs=audio,
outputs=output,
fn=transcribe,
examples_per_page=5,
)
# Footer
gr.HTML("""
<div class="footer-container">
<p>
Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="footer-link" target="_blank">anycoder</a> β€’
Powered by <a href="https://huggingface.co/kyutai/stt-2.6b-en-trfs" class="footer-link" target="_blank">Kyutai STT 2.6B</a>
</p>
</div>
""")
# Event handlers
transcribe_btn.click(
fn=transcribe,
inputs=audio,
outputs=output,
api_name="transcribe",
)
if __name__ == "__main__":
# Custom theme for modern, clean design
theme = gr.themes.Soft(
primary_hue="blue",
secondary_hue="slate",
neutral_hue="slate",
font=gr.themes.GoogleFont("Inter"),
text_size="lg",
spacing_size="md",
radius_size="lg",
).set(
button_primary_background_fill="*primary_600",
button_primary_background_fill_hover="*primary_700",
block_title_text_weight="600",
block_label_text_weight="500",
)
demo.launch(theme=theme, css_paths="style.css", mcp_server=True)