import io

import gradio as gr
import scipy
import scipy.io.wavfile
from transformers import AutoProcessor, BarkModel
# Checkpoint identifier shared by the processor and the model so that the two
# are always loaded from the same place.
BARK_CHECKPOINT = "suno/bark-small"

# Pre-processor used by greet() to turn a text prompt into model inputs.
processor = AutoProcessor.from_pretrained(BARK_CHECKPOINT)

# Load the Bark weights and keep whatever to_bettertransformer() returns as
# the module-level model used for generation.
model = BarkModel.from_pretrained(BARK_CHECKPOINT).to_bettertransformer()
def greet(text: str) -> bytes:
    """Synthesize speech for ``text`` with Bark and return it as WAV bytes.

    Args:
        text: Prompt to speak. It is handed to the module-level ``processor``
            as a one-element batch.

    Returns:
        The bytes of a complete WAV file whose header declares a sampling
        rate of 24000 Hz.
    """
    inputs = processor(
        text=[text],
        return_tensors="pt",
    )

    # ``use_flash_attention_2`` is a model-loading option (``from_pretrained``),
    # not a generation argument, so it is deliberately not forwarded here.
    speech_values = model.generate(**inputs, do_sample=True)

    # Move the result to host memory and drop singleton dimensions so that a
    # one-prompt batch becomes a plain sample array.
    waveform = speech_values.cpu().numpy().squeeze()

    # Encode into an in-memory buffer rather than a fixed "tmp.wav" path:
    # concurrent requests can no longer overwrite each other's audio, no file
    # handle is left open, and nothing is written to the working directory.
    buffer = io.BytesIO()
    scipy.io.wavfile.write(buffer, rate=24000, data=waveform)
    return buffer.getvalue()
# Web UI definition: a single text input routed through greet(), with the
# function's return value shown by an audio output component.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="audio",
)

# Start serving the interface as soon as the module has finished loading.
iface.launch()