File size: 3,536 Bytes
7153ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
# Expose only the GPU with index 1 to this process via CUDA_VISIBLE_DEVICES.
# NOTE(review): presumably set ahead of the remaining imports so that any
# CUDA-aware library loaded afterwards sees the restriction — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json

# Connect to the remote "VeuReu/asr" Space. The client is created once, at
# import time, and is the object through which the functions in this module
# issue their `predict` calls.
asr_client = Client("VeuReu/asr")


def extract_audio_from_video(video_path: str) -> str:
    """
    Extract the audio track of a video through the remote VeuReu/asr Space.

    The video is wrapped with ``handle_file`` and submitted to the
    ``/extract_audio_ffmpeg`` endpoint as the ``video_file`` argument, using
    the ``{"video": <file>}`` payload shape.

    Parameters
    ----------
    video_path : str
        Location of the video whose audio should be extracted.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be a
        path or identifier of the extracted audio file).
    """
    video_payload = {"video": handle_file(video_path)}
    return asr_client.predict(
        video_file=video_payload,
        api_name="/extract_audio_ffmpeg",
    )


def diarize_audio(audio_path: str) -> str:
    """
    Run speaker diarization on an audio file via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/diaritzar_audio`` endpoint as the ``wav_archivo`` argument.

    Parameters
    ----------
    audio_path : str
        Location of the audio file to diarize.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be a
        JSON-like description of speaker segments and their timings).
    """
    uploaded_audio = handle_file(audio_path)
    return asr_client.predict(
        wav_archivo=uploaded_audio,
        api_name="/diaritzar_audio",
    )


def transcribe_long_audio(audio_path: str) -> str:
    """
    Transcribe a long recording via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/transcribe_long_audio`` endpoint as the ``wav_path`` argument. This
    is the wrapper intended for recordings of extended duration.

    Parameters
    ----------
    audio_path : str
        Location of the long audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be
        the transcribed text).
    """
    uploaded_audio = handle_file(audio_path)
    return asr_client.predict(
        wav_path=uploaded_audio,
        api_name="/transcribe_long_audio",
    )


def transcribe_short_audio(audio_path: str) -> str:
    """
    Transcribe a short audio sample via the remote VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and sent to the
    ``/transcribe_wav`` endpoint as the ``wav_path`` argument. This is the
    wrapper intended for short-duration samples.

    Parameters
    ----------
    audio_path : str
        Location of the short audio file to transcribe.

    Returns
    -------
    str
        The endpoint's response, passed through unchanged (expected to be
        the transcribed text).
    """
    uploaded_audio = handle_file(audio_path)
    return asr_client.predict(
        wav_path=uploaded_audio,
        api_name="/transcribe_wav",
    )


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function asks the remote service which known speaker (from the
    provided collection of voice profiles) appears in the given audio clip.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries describing the known voices. It must be
        JSON-serializable, because it is sent to the endpoint as a JSON string.

    Returns
    -------
    Any
        Output returned by the remote speaker identification endpoint,
        passed through unchanged.

    Raises
    ------
    TypeError
        If ``voice_col`` contains values that ``json.dumps`` cannot serialize.
    """
    # The endpoint receives the voice collection as text, so serialize it
    # before handing it to the client.
    voice_col_str = json.dumps(voice_col)
    return asr_client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu",
    )