"""Thin client wrappers around the remote VeuReu/asr Gradio Space.

Every public function forwards its arguments to one named endpoint of the
Space through a lazily created, module-wide ``gradio_client.Client``.
"""

import os

# Set before the remaining imports run.
# NOTE(review): presumably so that any CUDA-aware library imported afterwards
# only sees GPU index 1 -- confirm this is still wanted for a pure client.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json

# Lazy initialization to avoid crash if Space is down at import time
_asr_client = None


def _get_asr_client():
    """Get or create the ASR client (lazy initialization).

    Returns
    -------
    Client
        The shared ``gradio_client.Client`` connected to ``VeuReu/asr``.
        It is created on the first call and reused afterwards.
    """
    global _asr_client
    if _asr_client is None:
        _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts
    its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the
        remote service.
    """
    result = _get_asr_client().predict(
        # The file handle is wrapped in a dict under the "video" key.
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg",
    )
    return result


def diarize_audio(audio_path: str) -> Any:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of
    speech belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    Any
        Whatever the remote endpoint returns, passed through unchanged.
        ``extract_audio_and_diarize`` consumes it as a two-item sequence
        ``(clips_paths, segments)``.
        NOTE(review): the exact payload shape is defined by the Space --
        confirm against its API page.
    """
    result = _get_asr_client().predict(
        wav_file=handle_file(audio_path),
        api_name="/diaritzar_audio",
    )
    return result


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the
    ASR model optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio",
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces
    fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav",
    )
    return result


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known
        voices. It must be JSON-serializable.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    # The collection is sent to the endpoint as a single JSON string.
    voice_col_str = json.dumps(voice_col)
    result = _get_asr_client().predict(
        wav_file=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu",
    )
    return result


def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Call the /voice_embedding endpoint to get a voice embedding vector.

    This replaces local SpeakerRecognition processing by delegating to the
    asr Space.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        Normalized embedding vector for the voice, or an empty list when the
        remote result is empty or any exception is raised.
    """
    # Deliberately best-effort: any failure is reported on stdout and turned
    # into an empty embedding instead of propagating to the caller.
    try:
        result = _get_asr_client().predict(
            wav_file=handle_file(audio_path),
            api_name="/voice_embedding",
        )
        return result if result else []
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []


def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract audio from video and perform diarization in one call.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Always contains 'clips' and 'segments', taken from the first and
        second items of the diarization result (empty lists when missing).
        'audio_path' is added once audio extraction succeeded, and 'error'
        is added when extraction returned nothing or an exception was
        raised.
    """
    try:
        # First extract audio
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        # Then diarize
        result = diarize_audio(audio_path)

        # Expected shape is a sequence (clips_paths, segments). Checking the
        # type first keeps a plain string (which also has a length) from
        # being indexed character by character.
        if isinstance(result, (list, tuple)) and len(result) >= 2:
            return {
                "clips": result[0] or [],
                "segments": result[1] or [],
                "audio_path": audio_path,
            }

        return {"clips": [], "segments": [], "audio_path": audio_path}
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}