Spaces:
Runtime error
Runtime error
Omar Sanseviero
committed on
Commit
·
1a8cc73
1
Parent(s):
ec81f5a
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ from tortoise_tts.api import TextToSpeech
|
|
| 12 |
from tortoise_tts.utils.audio import load_audio, get_voices
|
| 13 |
import torch
|
| 14 |
import torchaudio
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -50,8 +51,47 @@ def inference(text, voice):
|
|
| 50 |
print("gen")
|
| 51 |
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 52 |
return "generated.wav"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
|
| 57 |
examples = [
|
|
@@ -60,18 +100,33 @@ examples = [
|
|
| 60 |
["how are you doing this day", "freeman"]
|
| 61 |
]
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
|
|
|
| 12 |
from tortoise_tts.utils.audio import load_audio, get_voices
|
| 13 |
import torch
|
| 14 |
import torchaudio
|
| 15 |
+
import numpy as np
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 51 |
print("gen")
|
| 52 |
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 53 |
return "generated.wav"
|
| 54 |
+
|
| 55 |
+
def load_audio_special(sr, data):
|
| 56 |
+
if data.dtype == np.int32:
|
| 57 |
+
norm_fix = 2 ** 31
|
| 58 |
+
elif data.dtype == np.int16:
|
| 59 |
+
norm_fix = 2 ** 15
|
| 60 |
+
elif data.dtype == np.float16 or data.dtype == np.float32:
|
| 61 |
+
norm_fix = 1.
|
| 62 |
+
audio = torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
|
| 63 |
|
| 64 |
+
# Remove any channel data.
|
| 65 |
+
if len(audio.shape) > 1:
|
| 66 |
+
if audio.shape[0] < 5:
|
| 67 |
+
audio = audio[0]
|
| 68 |
+
else:
|
| 69 |
+
assert audio.shape[1] < 5
|
| 70 |
+
audio = audio[:, 0]
|
| 71 |
+
|
| 72 |
+
if sr != sampling_rate:
|
| 73 |
+
audio = torchaudio.functional.resample(audio, sr, sampling_rate)
|
| 74 |
|
| 75 |
+
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
|
| 76 |
+
# '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
|
| 77 |
+
if torch.any(audio > 2) or not torch.any(audio < 0):
|
| 78 |
+
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
|
| 79 |
+
audio.clip_(-1, 1)
|
| 80 |
+
return audio.unsqueeze(0)
|
| 81 |
+
|
| 82 |
+
def inference_own_voice(text, voice_1, voice_2, voice_3):
|
| 83 |
+
text = text[:256]
|
| 84 |
+
print(voice_1)
|
| 85 |
+
conds = [
|
| 86 |
+
load_audio_special(voice_1),
|
| 87 |
+
load_audio_special(voice_2),
|
| 88 |
+
load_audio_special(voice_1_3),
|
| 89 |
+
]
|
| 90 |
+
print(text, conds, preset)
|
| 91 |
+
gen = tts.tts_with_preset(text, conds, preset)
|
| 92 |
+
print("gen")
|
| 93 |
+
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
|
| 94 |
+
return "generated.wav"
|
| 95 |
|
| 96 |
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
|
| 97 |
examples = [
|
|
|
|
| 100 |
["how are you doing this day", "freeman"]
|
| 101 |
]
|
| 102 |
|
| 103 |
+
block = gr.Blocks()
|
| 104 |
+
with block:
|
| 105 |
+
gr.Markdown("# TorToiSe")
|
| 106 |
+
gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
|
| 107 |
+
with gr.Tabs():
|
| 108 |
+
with gr.TabItem("Pre-recorded voices"):
|
| 109 |
+
iface = gr.Interface(
|
| 110 |
+
inference,
|
| 111 |
+
inputs=[
|
| 112 |
+
gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
|
| 113 |
+
gr.inputs.Dropdown(voices),
|
| 114 |
+
],
|
| 115 |
+
outputs="audio",
|
| 116 |
+
examples=examples,
|
| 117 |
+
)
|
| 118 |
+
with gr.TabItem("Record your voice"):
|
| 119 |
+
iface = gr.Interface(
|
| 120 |
+
inference_own_voice,
|
| 121 |
+
inputs=[
|
| 122 |
+
gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
|
| 123 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 1)", type="numpy"),
|
| 124 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 2)", type="numpy"),
|
| 125 |
+
gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 3)", type="numpy"),
|
| 126 |
+
],
|
| 127 |
+
outputs="audio"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",)
|
| 131 |
|
| 132 |
+
block.launch()
|