Michael Hu committed · commit 933cc7f
Parent(s): 2d46a24

tts is deprecated, use fish speech
Files changed:
- app.py +1 -1
- config/tts_config.yaml +13 -0
- download_models.py +5 -0
- requirements.txt +3 -6
- utils/tts.py +45 -37
app.py
CHANGED
@@ -55,7 +55,7 @@ def handle_file_processing(upload_path):
 
     # TTS Phase
     status_text.markdown("🎵 **Generating Chinese Speech...**")
-    output_path = generate_speech(chinese_text)
+    output_path = generate_speech(chinese_text, language="zh")
     progress_bar.progress(100)
 
     # Display results
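app.py now hard-codes language="zh" at the call site. If the surrounding UI is Streamlit (the status_text.markdown and progress_bar.progress calls suggest it is), the language could instead come from a selector. A hedged sketch of that variation; the selectbox, text area, and button below are assumptions and are not part of this commit:

# Hypothetical variation on the app.py call site: let the user pick the
# language instead of hard-coding "zh". Assumes a Streamlit UI.
import streamlit as st

from utils.tts import generate_speech

language = st.selectbox("Speech language", options=["zh", "en"], index=0)
text = st.text_area("Text to speak", "你好，世界")

if st.button("Generate speech"):
    # generate_speech returns the path of the exported WAV file
    output_path = generate_speech(text, language=language)
    st.audio(output_path)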
config/tts_config.yaml
ADDED
@@ -0,0 +1,13 @@
+tts:
+  model: fish-speech-400m-v1
+  vocoder: hifigan-v1
+  device: auto
+  precision: fp16
+
+generation:
+  temperature: 0.7
+  top_k: 20
+  max_length: 4096
+  language_mapping:
+    zh: "[ZH]{text}[ZH]"
+    en: "[EN]{text}[EN]"
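The language_mapping templates wrap the input text in language tags before synthesis; utils/tts.py looks them up under config["generation"]["language_mapping"] and fills {text} with str.format. A minimal standalone sketch of that expansion, with the mapping inlined here instead of read from the YAML file:

# Sketch: how the language templates in tts_config.yaml expand.
language_mapping = {
    "zh": "[ZH]{text}[ZH]",
    "en": "[EN]{text}[EN]",
}

def wrap_text(text: str, language: str = "zh") -> str:
    # An unknown language raises KeyError, mirroring the dict lookup in utils/tts.py.
    return language_mapping[language].format(text=text)

print(wrap_text("你好，世界"))           # [ZH]你好，世界[ZH]
print(wrap_text("Hello world", "en"))   # [EN]Hello world[EN]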
download_models.py
ADDED
@@ -0,0 +1,5 @@
+from fish_audio.sdk.utils import download_all_models
+
+if __name__ == "__main__":
+    download_all_models()
+    print("All models downloaded to ~/.cache/fish_audio")
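To pre-fetch the weights before first launch, the script can be run once from the repository root. A small wrapper sketch that runs it and then lists what landed in the cache; the ~/.cache/fish_audio location is taken from the script's own message and is not verified against the SDK:

# Hypothetical wrapper around download_models.py: run the download,
# then report the contents of the cache directory the script mentions.
import subprocess
from pathlib import Path

subprocess.run(["python", "download_models.py"], check=True)

cache_dir = Path.home() / ".cache" / "fish_audio"   # assumed location
if cache_dir.exists():
    for path in sorted(cache_dir.rglob("*")):
        print(path)
else:
    print(f"cache directory not found: {cache_dir}")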
requirements.txt
CHANGED
@@ -4,10 +4,7 @@ nltk>=3.8
 librosa>=0.10
 soundfile>=0.12
 ffmpeg-python>=0.2
-torch>=2.0,<3.0
 transformers[audio]>=4.33
-
-
-
-scipy~=1.11.0  # SciPy release compatible with NumPy 1.x
-scikit-learn~=1.3.0  # scikit-learn release compatible with older NumPy
+fish-audio-sdk>=0.0.7
+torch>=2.1.0
+torchaudio>=2.1.0
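Since the migration raises the torch pin to >=2.1.0 and adds torchaudio, a quick sanity check after pip install -r requirements.txt can confirm the resolved versions; a small sketch, with the pins above being the only versions this commit actually asserts:

# Quick environment check for the new dependency set.
import torch
import torchaudio

print("torch:", torch.__version__)
print("torchaudio:", torchaudio.__version__)
print("CUDA available:", torch.cuda.is_available())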
utils/tts.py
CHANGED
@@ -1,46 +1,54 @@
-"""
-Text-to-Speech Module using YourTTS
-Handles speech synthesis and output generation
-"""
-
-from TTS.api import TTS
-import os
 import time
-        "assets/reference_voice.wav"
-        if os.path.exists("assets/reference_voice.wav")
-        else None
-    )
+import yaml
+from pathlib import Path
+import torch
+from fish_audio.sdk import TextToSpeech, Vocoder
+from pydub import AudioSegment
+
+# Load config
+config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
+with open(config_path) as f:
+    config = yaml.safe_load(f)
+
+# Initialize models
+tts_model = TextToSpeech(
+    model_name=config["tts"]["model"],
+    device=config["tts"]["device"],
+    precision=config["tts"]["precision"],
+)
+
+vocoder = Vocoder(
+    model_name=config["tts"]["vocoder"],
+    device=tts_model.device,
+)
+
+def generate_speech(text: str, language: str = "zh") -> str:
+    """Generate speech from text using Fish Audio SDK"""
+    # Format text with language tags
+    lang_template = config["generation"]["language_mapping"][language]
+    processed_text = lang_template.format(text=text)
+
+    # Generate mel spectrogram
+    mel = tts_model.generate(
+        text=processed_text,
+        temperature=config["generation"]["temperature"],
+        top_k=config["generation"]["top_k"],
+        max_length=config["generation"]["max_length"],
+    )
+
+    # Convert mel to waveform
+    waveform = vocoder(mel)
+
+    # Create audio segment
+    audio = AudioSegment(
+        waveform.numpy().tobytes(),
+        frame_rate=vocoder.sample_rate,
+        sample_width=2,
+        channels=1,
+    )
+
+    # Save output
+    output_path = f"temp/outputs/output_{int(time.time())}.wav"
+    audio.export(output_path, format="wav")
+
     return output_path
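The rewritten module writes its WAV files to temp/outputs/ but does not create that directory itself. A minimal usage sketch; the ensure-directory step and the example sentence are additions for illustration, not part of the commit:

# Minimal usage sketch for the rewritten utils/tts.py.
# Assumes it is run from the repository root; temp/outputs/ is created up
# front because generate_speech() only writes into it.
import os

from utils.tts import generate_speech

os.makedirs("temp/outputs", exist_ok=True)

wav_path = generate_speech("你好，欢迎使用语音合成。", language="zh")
print("wrote", wav_path)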