import os

# Select GPU 1 for this process; must be set before any CUDA-aware library is initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json
from typing import Any, Dict, List

from PIL import Image
from gradio_client import Client, handle_file

# Client for the remote VeuReu/asr Hugging Face Space that hosts the ASR endpoints used below.
asr_client = Client("VeuReu/asr")


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the remote service.
    """
    # The endpoint expects a Gradio video payload: a dict with the uploaded file under "video".
    result = asr_client.predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg",
    )
    return result
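
# Example usage (a sketch, not part of the original module; "lecture.mp4" is a hypothetical file
# and the exact shape of the returned value depends on the Space):
#
#     audio_path = extract_audio_from_video("lecture.mp4")
#     print(audio_path)  # expected: path/identifier of the extracted audio track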


def diarize_audio(audio_path: str) -> str:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of speech
    belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    str
        JSON-like diarization output containing speaker segments and timings.
    """
    result = asr_client.predict(
        wav_archivo=handle_file(audio_path),
        api_name="/diaritzar_audio",
    )
    return result
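
# Example usage (a sketch under the assumption that the endpoint returns a JSON string of
# speaker segments; "meeting.wav" is a hypothetical file):
#
#     segments = json.loads(diarize_audio("meeting.wav"))
#     for seg in segments:
#         print(seg)  # e.g. speaker label plus start/end times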


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the ASR model
    optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = asr_client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio",
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = asr_client.predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav",
    )
    return result
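
# Example usage (illustrative; file names are hypothetical). The two helpers share the same
# signature and differ only in which remote endpoint they call:
#
#     text_long = transcribe_long_audio("interview_full.wav")   # long recordings
#     text_short = transcribe_short_audio("clip_5s.wav")        # short clips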


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known voices.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    # The remote endpoint expects the voice collection serialized as a JSON string.
    voice_col_str = json.dumps(voice_col)
    result = asr_client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu",
    )
    return result
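

# Minimal end-to-end sketch (not part of the original module). It assumes a local
# "example_video.mp4", that extract_audio_from_video() returns a usable audio path, and that
# an empty voice collection is acceptable for identificar_veu(); adjust to your own data.
if __name__ == "__main__":
    video = "example_video.mp4"          # hypothetical input file
    audio = extract_audio_from_video(video)
    print("Extracted audio:", audio)

    print("Diarization:", diarize_audio(audio))
    print("Transcript:", transcribe_long_audio(audio))

    voice_profiles: List[Dict[str, Any]] = []   # hypothetical empty voice collection
    print("Speaker ID:", identificar_veu(audio, voice_profiles))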