hysts committed
Commit 54ed3c0 · 1 Parent(s): c8c4a59
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.12.0
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.16.1
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Kyutai Stt 2.6b En
+ title: Kyutai STT 2.6B EN
  emoji: 😻
  colorFrom: red
  colorTo: blue
app.py ADDED
@@ -0,0 +1,64 @@
+ #!/usr/bin/env python
+
+ import os
+ import pathlib
+
+ import gradio as gr
+ import librosa
+ import spaces
+ import torch
+ from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor
+
+ DESCRIPTION = "# Kyutai STT 2.6B EN"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "kyutai/stt-2.6b-en"
+ model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device)
+ processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+
+ SAMPLE_RATE = 24000
+ MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
+ MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION
+
+
+ @spaces.GPU
+ def transcribe(audio_path: str) -> str:
+     """Transcribe an English audio file to text.
+
+     Args:
+         audio_path (str): The path to the audio file. The audio must contain English speech.
+
+     Returns:
+         str: The transcription of the English audio file.
+     """
+     data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
+     if len(data) > MAX_SAMPLE_SIZE:
+         data = data[:MAX_SAMPLE_SIZE]
+         gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")
+
+     inputs = processor(data)
+     inputs.to(device)
+     output_tokens = model.generate(**inputs)
+     output = processor.batch_decode(output_tokens, skip_special_tokens=True)
+     return output[0]
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Row():
+         with gr.Column():
+             audio = gr.Audio(label="Audio", type="filepath")
+             run_button = gr.Button()
+         with gr.Column():
+             output = gr.Textbox(label="Transcription")
+     gr.Examples(
+         examples=sorted(pathlib.Path("assets").glob("*.wav")),
+         inputs=audio,
+         outputs=output,
+         fn=transcribe,
+     )
+
+     run_button.click(fn=transcribe, inputs=audio, outputs=output)
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
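
Because the click handler registers transcribe as a callable endpoint and the app is launched with mcp_server=True, the deployed Space can also be queried programmatically. Below is a minimal sketch using gradio_client; the Space ID is a placeholder, and "/transcribe" assumes Gradio's default endpoint name derived from the function name.

from gradio_client import Client, handle_file

# Hypothetical Space ID; replace with the actual deployment.
client = Client("<user>/<space-id>")
# "/transcribe" is assumed to be the auto-generated endpoint for transcribe().
text = client.predict(handle_file("sample.wav"), api_name="/transcribe")
print(text)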
assets/peter-piper.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e700f0918c28b292a09b58f862a033e26e6f050f7e8ea42cb3323349220a0273
+ size 615644
assets/seashells.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b557afe5a1ee56012907b4583439b3e1ad6a3b93a6ebeed361bdf179f68a7571
+ size 452444
pyproject.toml ADDED
@@ -0,0 +1,59 @@
+ [project]
+ name = "kyutai-stt-2-6b-en"
+ version = "0.1.0"
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "accelerate>=1.8.1",
+     "gradio[mcp]>=5.34.2",
+     "hf-transfer>=0.1.9",
+     "librosa>=0.11.0",
+     "spaces>=0.37.1",
+     "torch==2.5.1",
+     "transformers",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812", # missing-trailing-comma
+     "D203", # one-blank-line-before-class
+     "D213", # multi-line-summary-second-line
+     "E501", # line-too-long
+     "SIM117", # multiple-with-statements
+     #
+     "D100", # undocumented-public-module
+     "D101", # undocumented-public-class
+     "D102", # undocumented-public-method
+     "D103", # undocumented-public-function
+     "D104", # undocumented-public-package
+     "D105", # undocumented-magic-method
+     "D107", # undocumented-public-init
+     "EM101", # raw-string-in-exception
+     "FBT001", # boolean-type-hint-positional-argument
+     "FBT002", # boolean-default-value-positional-argument
+     "PD901", # pandas-df-variable-name
+     "PGH003", # blanket-type-ignore
+     "PLR0913", # too-many-arguments
+     "PLR0915", # too-many-statements
+     "TRY003", # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401", # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
+
+ [tool.uv.sources]
+ transformers = { git = "https://github.com/huggingface/transformers", rev = "v4.52.4-Kyutai-STT-preview" }
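
The [tool.uv.sources] entry pins transformers to the v4.52.4-Kyutai-STT-preview tag, presumably because the Kyutai STT classes used in app.py were not yet available in a stable release at the time of this commit. A minimal sketch, assuming an environment installed from this project, to confirm the installed build exposes those classes:

import transformers

# Expected to report the pinned preview build rather than an older stable release.
print(transformers.__version__)

# These imports are expected to fail with ImportError on releases
# that predate Kyutai STT support.
from transformers import (
    KyutaiSpeechToTextForConditionalGeneration,
    KyutaiSpeechToTextProcessor,
)

print(KyutaiSpeechToTextForConditionalGeneration.__name__, KyutaiSpeechToTextProcessor.__name__)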
requirements.txt ADDED
@@ -0,0 +1,319 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ accelerate==1.8.1
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ aiofiles==24.1.0
+     # via gradio
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.9.0
+     # via
+     #   gradio
+     #   httpx
+     #   mcp
+     #   sse-starlette
+     #   starlette
+ audioread==3.0.1
+     # via librosa
+ certifi==2025.6.15
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ cffi==1.17.1
+     # via soundfile
+ charset-normalizer==3.4.2
+     # via requests
+ click==8.2.1
+     # via
+     #   typer
+     #   uvicorn
+ decorator==5.2.1
+     # via librosa
+ exceptiongroup==1.3.0
+     # via anyio
+ fastapi==0.115.13
+     # via gradio
+ ffmpy==0.6.0
+     # via gradio
+ filelock==3.18.0
+     # via
+     #   huggingface-hub
+     #   torch
+     #   transformers
+     #   triton
+ fsspec==2025.5.1
+     # via
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.34.2
+     # via
+     #   kyutai-stt-2-6b-en (pyproject.toml)
+     #   spaces
+ gradio-client==1.10.3
+     # via gradio
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-transfer==0.1.9
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ hf-xet==1.1.5
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   gradio
+     #   gradio-client
+     #   mcp
+     #   safehttpx
+     #   spaces
+ httpx-sse==0.4.1
+     # via mcp
+ huggingface-hub==0.33.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+     #   tokenizers
+     #   transformers
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.1
+     # via
+     #   librosa
+     #   scikit-learn
+ lazy-loader==0.4
+     # via librosa
+ librosa==0.11.0
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ llvmlite==0.44.0
+     # via numba
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ mcp==1.9.3
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ msgpack==1.1.1
+     # via librosa
+ networkx==3.4.2
+     # via torch
+ numba==0.61.2
+     # via librosa
+ numpy==2.2.6
+     # via
+     #   accelerate
+     #   gradio
+     #   librosa
+     #   numba
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   soundfile
+     #   soxr
+     #   transformers
+ nvidia-cublas-cu12==12.4.5.8
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.4.127
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.4.127
+     # via torch
+ nvidia-cuda-runtime-cu12==12.4.127
+     # via torch
+ nvidia-cudnn-cu12==9.1.0.70
+     # via torch
+ nvidia-cufft-cu12==11.2.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.5.147
+     # via torch
+ nvidia-cusolver-cu12==11.6.1.9
+     # via torch
+ nvidia-cusparse-cu12==12.3.1.170
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-nccl-cu12==2.21.5
+     # via torch
+ nvidia-nvjitlink-cu12==12.4.127
+     # via
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.4.127
+     # via torch
+ orjson==3.10.18
+     # via gradio
+ packaging==25.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   lazy-loader
+     #   pooch
+     #   spaces
+     #   transformers
+ pandas==2.3.0
+     # via gradio
+ pillow==11.2.1
+     # via gradio
+ platformdirs==4.3.8
+     # via pooch
+ pooch==1.8.2
+     # via librosa
+ psutil==5.9.8
+     # via
+     #   accelerate
+     #   spaces
+ pycparser==2.22
+     # via cffi
+ pydantic==2.11.7
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+     #   pydantic-settings
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydantic-settings==2.10.1
+     # via mcp
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.1
+     # via pydantic-settings
+ python-multipart==0.0.20
+     # via
+     #   gradio
+     #   mcp
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   accelerate
+     #   gradio
+     #   huggingface-hub
+     #   transformers
+ regex==2024.11.6
+     # via transformers
+ requests==2.32.4
+     # via
+     #   huggingface-hub
+     #   pooch
+     #   spaces
+     #   transformers
+ rich==14.0.0
+     # via typer
+ ruff==0.12.0
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.5.3
+     # via
+     #   accelerate
+     #   transformers
+ scikit-learn==1.7.0
+     # via librosa
+ scipy==1.15.3
+     # via
+     #   librosa
+     #   scikit-learn
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ soundfile==0.13.1
+     # via librosa
+ soxr==0.5.0.post1
+     # via librosa
+ spaces==0.37.1
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ sse-starlette==2.3.6
+     # via mcp
+ starlette==0.46.2
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+ sympy==1.13.1
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tokenizers==0.21.2
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.5.1
+     # via
+     #   kyutai-stt-2-6b-en (pyproject.toml)
+     #   accelerate
+ tqdm==4.67.1
+     # via
+     #   huggingface-hub
+     #   transformers
+ transformers @ git+https://github.com/huggingface/transformers@6bdd4ec95264e5d8f219cfe4ee29ea9b42474bb7
+     # via kyutai-stt-2-6b-en (pyproject.toml)
+ triton==3.1.0
+     # via torch
+ typer==0.16.0
+     # via gradio
+ typing-extensions==4.14.0
+     # via
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   librosa
+     #   pydantic
+     #   pydantic-core
+     #   rich
+     #   spaces
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.1
+     # via
+     #   pydantic
+     #   pydantic-settings
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.34.3
+     # via
+     #   gradio
+     #   mcp
+ websockets==15.0.1
+     # via gradio-client
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
uv.lock ADDED
The diff for this file is too large to render.