Spaces:
Runtime error
Runtime error
Omar Sanseviero
committed on
Commit
·
1a8cc73
1
Parent(s):
ec81f5a
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ from tortoise_tts.api import TextToSpeech
|
|
| 12 |
from tortoise_tts.utils.audio import load_audio, get_voices
|
| 13 |
import torch
|
| 14 |
import torchaudio
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -50,8 +51,47 @@ def inference(text, voice):
|
|
| 50 |
print("gen")
|
| 51 |
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 52 |
return "generated.wav"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
|
| 57 |
examples = [
|
|
@@ -60,18 +100,33 @@ examples = [
|
|
| 60 |
["how are you doing this day", "freeman"]
|
| 61 |
]
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
|
|
|
| 12 |
from tortoise_tts.utils.audio import load_audio, get_voices
|
| 13 |
import torch
|
| 14 |
import torchaudio
|
| 15 |
+
import numpy as np
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 51 |
print("gen")
|
| 52 |
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 53 |
return "generated.wav"
|
| 54 |
+
|
| 55 |
+
def load_audio_special(sr, data):
|
| 56 |
+
if data.dtype == np.int32:
|
| 57 |
+
norm_fix = 2 ** 31
|
| 58 |
+
elif data.dtype == np.int16:
|
| 59 |
+
norm_fix = 2 ** 15
|
| 60 |
+
elif data.dtype == np.float16 or data.dtype == np.float32:
|
| 61 |
+
norm_fix = 1.
|
| 62 |
+
audio = torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
|
| 63 |
|
| 64 |
+
# Remove any channel data.
|
| 65 |
+
if len(audio.shape) > 1:
|
| 66 |
+
if audio.shape[0] < 5:
|
| 67 |
+
audio = audio[0]
|
| 68 |
+
else:
|
| 69 |
+
assert audio.shape[1] < 5
|
| 70 |
+
audio = audio[:, 0]
|
| 71 |
+
|
| 72 |
+
if sr != sampling_rate:
|
| 73 |
+
audio = torchaudio.functional.resample(audio, sr, sampling_rate)
|
| 74 |
|
| 75 |
+
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
|
| 76 |
+
# '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
|
| 77 |
+
if torch.any(audio > 2) or not torch.any(audio < 0):
|
| 78 |
+
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
|
| 79 |
+
audio.clip_(-1, 1)
|
| 80 |
+
return audio.unsqueeze(0)
|
| 81 |
+
|
| 82 |
+
def inference_own_voice(text, voice_1, voice_2, voice_3):
|
| 83 |
+
text = text[:256]
|
| 84 |
+
print(voice_1)
|
| 85 |
+
conds = [
|
| 86 |
+
load_audio_special(voice_1),
|
| 87 |
+
load_audio_special(voice_2),
|
| 88 |
+
load_audio_special(voice_1_3),
|
| 89 |
+
]
|
| 90 |
+
print(text, conds, preset)
|
| 91 |
+
gen = tts.tts_with_preset(text, conds, preset)
|
| 92 |
+
print("gen")
|
| 93 |
+
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 94 |
+
return "generated.wav"
|
| 95 |
|
| 96 |
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
|
| 97 |
examples = [
|
|
|
|
| 100 |
["how are you doing this day", "freeman"]
|
| 101 |
]
|
| 102 |
|
| 103 |
+
block = gr.Blocks()
|
| 104 |
+
with block:
|
| 105 |
+
gr.Markdown("# TorToiSe")
|
| 106 |
+
gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
|
| 107 |
+
with gr.Tabs():
|
| 108 |
+
with gr.TabItem("Pre-recorded voices"):
|
| 109 |
+
iface = gr.Interface(
|
| 110 |
+
inference,
|
| 111 |
+
inputs=[
|
| 112 |
+
gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
|
| 113 |
+
gr.inputs.Dropdown(voices),
|
| 114 |
+
],
|
| 115 |
+
outputs="audio",
|
| 116 |
+
examples=examples,
|
| 117 |
+
)
|
| 118 |
+
with gr.TabItem("Record your voice"):
|
| 119 |
+
iface = gr.Interface(
|
| 120 |
+
inference_own_voice,
|
| 121 |
+
inputs=[
|
| 122 |
+
gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
|
| 123 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 1)", type="numpy"),
|
| 124 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 2)", type="numpy"),
|
| 125 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 3)", type="numpy"),
|
| 126 |
+
],
|
| 127 |
+
outputs="audio"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",)
|
| 131 |
|
| 132 |
+
block.launch()
|