#!/usr/bin/env python
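# Requires (per the imports below): gradio, librosa, torch, transformers, and the `spaces` package for Hugging Face ZeroGPU.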
import os
import pathlib

import gradio as gr
import librosa
import spaces
import torch
from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "kyutai/stt-2.6b-en-trfs"
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device, torch_dtype="auto")
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)

SAMPLE_RATE = 24000
MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION
@spaces.GPU  # request a ZeroGPU worker for each transcription call
def transcribe(audio_path: str) -> str:
    """Transcribe an English audio file to text.

    Args:
        audio_path (str): The path to the audio file. The audio must contain English speech.

    Returns:
        str: The transcription of the English audio file.
    """
    if not audio_path:
        return ""
    # Load and resample to the 24 kHz rate the model expects.
    data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
    if len(data) > MAX_SAMPLE_SIZE:
        data = data[:MAX_SAMPLE_SIZE]
        gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")
    inputs = processor(data)
    inputs.to(device)
    output_tokens = model.generate(**inputs)
    output = processor.batch_decode(output_tokens, skip_special_tokens=True)
    return output[0]
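
# Quick local check outside the UI (hypothetical path; assumes a short English WAV in assets/):
#   print(transcribe("assets/example.wav"))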
# Custom theme for a modern, clean design
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="md",
    radius_size="lg",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_label_text_weight="500",
)

with gr.Blocks(theme=theme, css_paths="style.css", fill_height=False) as demo:
    # Header
    gr.HTML("""
        <div class="header-container">
            <h1 class="header-title">🎙️ Kyutai Speech-to-Text</h1>
            <p class="header-subtitle">Advanced English Audio Transcription powered by AI</p>
        </div>
    """)
    # Info banner
    gr.HTML(f"""
        <div class="info-banner">
            ℹ️ Upload or record audio in English (max {MAX_DURATION} seconds). Supports WAV, MP3, and other common formats.
        </div>
    """)  # noqa: RUF001
    # Main content
    with gr.Group(elem_classes="main-card"):
        # Audio input
        audio = gr.Audio(
            label="🎵 Audio Input",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-container",
        )

        # Transcribe button
        transcribe_btn = gr.Button(
            "✨ Transcribe Audio",
            variant="primary",
            size="lg",
            elem_classes="primary-button",
        )

        # Output
        output = gr.Textbox(
            label="📝 Transcription",
            placeholder="Your transcription will appear here...",
            lines=6,
            max_lines=12,
            elem_classes="transcription-output",
        )

    # Examples section
    with gr.Group(elem_classes="examples-container"):
        gr.Markdown("### 💡 Try These Examples")
        gr.Examples(
            examples=sorted(pathlib.Path("assets").glob("*.wav")) if pathlib.Path("assets").exists() else [],
            inputs=audio,
            outputs=output,
            fn=transcribe,
            examples_per_page=5,
        )

    # Footer
    gr.HTML("""
        <div class="footer-container">
            <p>
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="footer-link" target="_blank">anycoder</a> •
                Powered by <a href="https://huggingface.co/kyutai/stt-2.6b-en-trfs" class="footer-link" target="_blank">Kyutai STT 2.6B</a>
            </p>
        </div>
    """)
    # Event handlers
    transcribe_btn.click(
        fn=transcribe,
        inputs=audio,
        outputs=output,
        api_name="transcribe",
    )
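    # Note: with mcp_server=True in launch() below, endpoints that have an api_name and a
    # docstring (like "transcribe") are also exposed as MCP tools for MCP clients.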

if __name__ == "__main__":
    demo.launch(mcp_server=True)