hysts committed
Commit 54ed3c0 · 1 Parent(s): c8c4a59
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.12.0
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.16.1
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Kyutai Stt 2.6b En
+ title: Kyutai STT 2.6B EN
  emoji: 😻
  colorFrom: red
  colorTo: blue
app.py ADDED
@@ -0,0 +1,64 @@
+ #!/usr/bin/env python
+
+ import os
+ import pathlib
+
+ import gradio as gr
+ import librosa
+ import spaces
+ import torch
+ from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor
+
+ DESCRIPTION = "# Kyutai STT 2.6B EN"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "kyutai/stt-2.6b-en"
+ model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device)
+ processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+
+ SAMPLE_RATE = 24000
+ MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
+ MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION
+
+
+ @spaces.GPU
+ def transcribe(audio_path: str) -> str:
+     """Transcribe an English audio file to text.
+
+     Args:
+         audio_path (str): The path to the audio file. The audio must contain English speech.
+
+     Returns:
+         str: The transcription of the English audio file.
+     """
+     data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
+     if len(data) > MAX_SAMPLE_SIZE:
+         data = data[:MAX_SAMPLE_SIZE]
+         gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")
+
+     inputs = processor(data)
+     inputs.to(device)
+     output_tokens = model.generate(**inputs)
+     output = processor.batch_decode(output_tokens, skip_special_tokens=True)
+     return output[0]
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Row():
+         with gr.Column():
+             audio = gr.Audio(label="Audio", type="filepath")
+             run_button = gr.Button()
+         with gr.Column():
+             output = gr.Textbox(label="Transcription")
+     gr.Examples(
+         examples=sorted(pathlib.Path("assets").glob("*.wav")),
+         inputs=audio,
+         outputs=output,
+         fn=transcribe,
+     )
+
+     run_button.click(fn=transcribe, inputs=audio, outputs=output)
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
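
Because the click handler registers transcribe as a callable endpoint and the app is launched with mcp_server=True, the deployed Space can also be queried programmatically. Below is a minimal sketch using gradio_client; the Space ID is a placeholder, and "/transcribe" assumes Gradio's default endpoint name derived from the function name.

from gradio_client import Client, handle_file

# Hypothetical Space ID; replace with the actual deployment.
client = Client("<user>/<space-id>")
# "/transcribe" is assumed to be the auto-generated endpoint for transcribe().
text = client.predict(handle_file("sample.wav"), api_name="/transcribe")
print(text)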
assets/peter-piper.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e700f0918c28b292a09b58f862a033e26e6f050f7e8ea42cb3323349220a0273
+ size 615644
assets/seashells.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b557afe5a1ee56012907b4583439b3e1ad6a3b93a6ebeed361bdf179f68a7571
+ size 452444
pyproject.toml ADDED
@@ -0,0 +1,59 @@
+ [project]
+ name = "kyutai-stt-2-6b-en"
+ version = "0.1.0"
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "accelerate>=1.8.1",
+     "gradio[mcp]>=5.34.2",
+     "hf-transfer>=0.1.9",
+     "librosa>=0.11.0",
+     "spaces>=0.37.1",
+     "torch==2.5.1",
+     "transformers",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812", # missing-trailing-comma
+     "D203", # one-blank-line-before-class
+     "D213", # multi-line-summary-second-line
+     "E501", # line-too-long
+     "SIM117", # multiple-with-statements
+     #
+     "D100", # undocumented-public-module
+     "D101", # undocumented-public-class
+     "D102", # undocumented-public-method
+     "D103", # undocumented-public-function
+     "D104", # undocumented-public-package
+     "D105", # undocumented-magic-method
+     "D107", # undocumented-public-init
+     "EM101", # raw-string-in-exception
+     "FBT001", # boolean-type-hint-positional-argument
+     "FBT002", # boolean-default-value-positional-argument
+     "PD901", # pandas-df-variable-name
+     "PGH003", # blanket-type-ignore
+     "PLR0913", # too-many-arguments
+     "PLR0915", # too-many-statements
+     "TRY003", # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401", # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
+
+ [tool.uv.sources]
+ transformers = { git = "https://github.com/huggingface/transformers", rev = "v4.52.4-Kyutai-STT-preview" }
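
The [tool.uv.sources] entry pins transformers to the v4.52.4-Kyutai-STT-preview tag, presumably because the Kyutai STT classes used in app.py were not yet available in a stable release at the time of this commit. A minimal sketch, assuming an environment installed from this project, to confirm the installed build exposes those classes:

import transformers

# Expected to report the pinned preview build rather than an older stable release.
print(transformers.__version__)

# These imports are expected to fail with ImportError on releases
# that predate Kyutai STT support.
from transformers import (
    KyutaiSpeechToTextForConditionalGeneration,
    KyutaiSpeechToTextProcessor,
)

print(KyutaiSpeechToTextForConditionalGeneration.__name__, KyutaiSpeechToTextProcessor.__name__)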
requirements.txt ADDED
@@ -0,0 +1,319 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ accelerate==1.8.1
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ aiofiles==24.1.0
+     # via gradio
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.9.0
+     # via
+     #   gradio
+     #   httpx
+     #   mcp
+     #   sse-starlette
+     #   starlette
+ audioread==3.0.1
+     # via librosa
+ certifi==2025.6.15
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ cffi==1.17.1
+     # via soundfile
+ charset-normalizer==3.4.2
+     # via requests
+ click==8.2.1
+     # via
+     #   typer
+     #   uvicorn
+ decorator==5.2.1
+     # via librosa
+ exceptiongroup==1.3.0
+     # via anyio
+ fastapi==0.115.13
+     # via gradio
+ ffmpy==0.6.0
+     # via gradio
+ filelock==3.18.0
+     # via
+     #   huggingface-hub
+     #   torch
+     #   transformers
+     #   triton
+ fsspec==2025.5.1
+     # via
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.34.2
+     # via
+     #   kyutai-stt-2-6b-en (pyproject.toml)
+     #   spaces
+ gradio-client==1.10.3
+     # via gradio
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-transfer==0.1.9
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ hf-xet==1.1.5
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   gradio
+     #   gradio-client
+     #   mcp
+     #   safehttpx
+     #   spaces
+ httpx-sse==0.4.1
+     # via mcp
+ huggingface-hub==0.33.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+     #   tokenizers
+     #   transformers
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.1
+     # via
+     #   librosa
+     #   scikit-learn
+ lazy-loader==0.4
+     # via librosa
+ librosa==0.11.0
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ llvmlite==0.44.0
+     # via numba
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ mcp==1.9.3
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ msgpack==1.1.1
+     # via librosa
+ networkx==3.4.2
+     # via torch
+ numba==0.61.2
+     # via librosa
+ numpy==2.2.6
+     # via
+     #   accelerate
+     #   gradio
+     #   librosa
+     #   numba
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   soundfile
+     #   soxr
+     #   transformers
+ nvidia-cublas-cu12==12.4.5.8
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.4.127
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.4.127
+     # via torch
+ nvidia-cuda-runtime-cu12==12.4.127
+     # via torch
+ nvidia-cudnn-cu12==9.1.0.70
+     # via torch
+ nvidia-cufft-cu12==11.2.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.5.147
+     # via torch
+ nvidia-cusolver-cu12==11.6.1.9
+     # via torch
+ nvidia-cusparse-cu12==12.3.1.170
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-nccl-cu12==2.21.5
+     # via torch
+ nvidia-nvjitlink-cu12==12.4.127
+     # via
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.4.127
+     # via torch
+ orjson==3.10.18
+     # via gradio
+ packaging==25.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   lazy-loader
+     #   pooch
+     #   spaces
+     #   transformers
+ pandas==2.3.0
+     # via gradio
+ pillow==11.2.1
+     # via gradio
+ platformdirs==4.3.8
+     # via pooch
+ pooch==1.8.2
+     # via librosa
+ psutil==5.9.8
+     # via
+     #   accelerate
+     #   spaces
+ pycparser==2.22
+     # via cffi
+ pydantic==2.11.7
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+     #   pydantic-settings
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydantic-settings==2.10.1
+     # via mcp
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.1
+     # via pydantic-settings
+ python-multipart==0.0.20
+     # via
+     #   gradio
+     #   mcp
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   accelerate
+     #   gradio
+     #   huggingface-hub
+     #   transformers
+ regex==2024.11.6
+     # via transformers
+ requests==2.32.4
+     # via
+     #   huggingface-hub
+     #   pooch
+     #   spaces
+     #   transformers
+ rich==14.0.0
+     # via typer
+ ruff==0.12.0
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.5.3
+     # via
+     #   accelerate
+     #   transformers
+ scikit-learn==1.7.0
+     # via librosa
+ scipy==1.15.3
+     # via
+     #   librosa
+     #   scikit-learn
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ soundfile==0.13.1
+     # via librosa
+ soxr==0.5.0.post1
+     # via librosa
+ spaces==0.37.1
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ sse-starlette==2.3.6
+     # via mcp
+ starlette==0.46.2
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+ sympy==1.13.1
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tokenizers==0.21.2
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.5.1
+     # via
+     #   kyutai-stt-2-6b-en (pyproject.toml)
+     #   accelerate
+ tqdm==4.67.1
+     # via
+     #   huggingface-hub
+     #   transformers
+ transformers @ git+https://github.com/huggingface/transformers@6bdd4ec95264e5d8f219cfe4ee29ea9b42474bb7
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ triton==3.1.0
+     # via torch
+ typer==0.16.0
+     # via gradio
+ typing-extensions==4.14.0
+     # via
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   librosa
+     #   pydantic
+     #   pydantic-core
+     #   rich
+     #   spaces
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.1
+     # via
+     #   pydantic
+     #   pydantic-settings
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.34.3
+     # via
+     #   gradio
+     #   mcp
+ websockets==15.0.1
+     # via gradio-client
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
uv.lock ADDED
The diff for this file is too large to render.