import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import requests
import json

# Lazy initialization to avoid crash if Space is down at import time
_svision_client = None


def _get_svision_client():
    """Get or create the svision client (lazy initialization)."""
    global _svision_client
    if _svision_client is None:
        _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Response returned by the remote /scenes_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_file={"video": handle_file(video_path)},
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction",
    )
    return result


def keyframes_every_second_extraction(video_path: str):
    """
    Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Response returned by the remote /keyframes_every_second_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction",
    )
    return result


def add_ocr_and_faces(
    imagen_path: str,
    informacion_image: Dict[str, Any],
    face_col: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.

    This function sends an image together with metadata and face collection data
    to perform OCR, face detection, and annotation enhancement.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata.

    Returns
    -------
    Dict[str, Any]
        Processed output containing OCR results, face detection data, and annotations.
    """
    informacion_image_str = json.dumps(informacion_image)
    face_col_str = json.dumps(face_col)

    result = _get_svision_client().predict(
        image=handle_file(imagen_path),
        informacion_image=informacion_image_str,
        face_col=face_col_str,
        api_name="/add_ocr_and_faces",
    )
    return result


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Call the /describe_images endpoint of the remote Space VeuReu/svision.

    This function sends an image to receive a textual description of its visual content.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    result = _get_svision_client().predict(
        images=[{"image": handle_file(imagen_path)}],
        api_name="/describe_images",
    )
    return result
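

# Example (sketch, not called anywhere in this module): a typical way the video
# wrappers above are combined. "clip.mp4" and _example_video_pipeline are
# placeholders introduced purely for illustration; the exact shape of each
# response is defined by the remote VeuReu/svision Space.
def _example_video_pipeline(video_path: str = "clip.mp4"):
    """Sketch: extract scene boundaries and per-second keyframes for one video."""
    scenes = extract_scenes(video_path, threshold=240, offset_frames=5, crop_ratio=0.1)
    keyframes = keyframes_every_second_extraction(video_path)
    return scenes, keyframes
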

def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Extract a file path from a Gradio file object (dict, str, tuple, or other).

    Gradio Gallery returns different formats depending on version:
    - List of tuples: [(path, caption), ...]
    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
    - List of FileData: [FileData(path=..., url=...), ...]
    - List of paths: [path, ...]
    """
    if file_obj is None:
        return None

    # Handle tuple format: (path, caption)
    if isinstance(file_obj, tuple) and len(file_obj) >= 1:
        return _extract_path_from_gradio_file(file_obj[0])

    # Handle string path/URL
    if isinstance(file_obj, str):
        return file_obj

    # Handle dict format: {"path": "...", "url": "...", "name": "..."}
    if isinstance(file_obj, dict):
        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")

    # Handle FileData or similar object with attributes
    if hasattr(file_obj, "path") and file_obj.path:
        return file_obj.path
    if hasattr(file_obj, "url") and file_obj.url:
        return file_obj.url
    if hasattr(file_obj, "name") and file_obj.name:
        return file_obj.name

    # Last resort: convert to string
    return str(file_obj) if file_obj else None


def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.

    This replaces local DeepFace/face_recognition processing by delegating to the
    svision Space.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        List of dicts with 'embedding' (list of floats), 'face_crop_path'
        (image path string), and 'index'. Returns an empty list if no faces
        are detected or on error.
    """
    try:
        # Returns: (face_crops: list of images/dicts, face_embeddings: list of dicts)
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting",
        )

        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")

        # result is a tuple: (list of image paths/dicts, list of embedding dicts)
        if result and len(result) >= 2:
            face_crops_raw = result[0] if result[0] else []
            face_embeddings = result[1] if result[1] else []

            print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
            if face_crops_raw and len(face_crops_raw) > 0:
                print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")

            # Combine into a unified structure, extracting paths correctly
            faces = []
            for i, emb_dict in enumerate(face_embeddings):
                # Extract path from Gradio file object (might be dict or string)
                crop_path = None
                if i < len(face_crops_raw):
                    raw_crop = face_crops_raw[i]
                    crop_path = _extract_path_from_gradio_file(raw_crop)
                    if not crop_path:
                        print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")

                embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []

                faces.append({
                    "embedding": embedding,
                    "face_crop_path": crop_path,
                    "index": emb_dict.get("index", i) if isinstance(emb_dict, dict) else i,
                })

            print(f"[svision_client] Detected {len(faces)} faces from image")
            return faces

        return []
    except Exception as e:
        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
        import traceback
        traceback.print_exc()
        return []
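

# Example (sketch, not called anywhere in this module): consuming the structure
# returned by get_face_embeddings_from_image. The cosine-similarity matching,
# the "known_faces" mapping, and the 0.5 threshold are assumptions introduced
# here for illustration; they are not part of the svision API.
def _example_match_faces(
    image_path: str,
    known_faces: Dict[str, List[float]],
    threshold: float = 0.5,
) -> List[Dict[str, Any]]:
    """Sketch: label detected faces by cosine similarity against known embeddings."""
    import math

    def _cosine(a: List[float], b: List[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

    matches = []
    for face in get_face_embeddings_from_image(image_path):
        best_name, best_score = None, 0.0
        for name, ref_embedding in known_faces.items():
            score = _cosine(face["embedding"], ref_embedding)
            if score > best_score:
                best_name, best_score = name, score
        matches.append({
            "face_crop_path": face["face_crop_path"],
            "match": best_name if best_score >= threshold else None,
            "score": best_score,
        })
    return matches
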
""" try: result = _get_svision_client().predict( image=handle_file(image_path), api_name="/face_image_embedding" ) return result if result else [] except Exception as e: print(f"[svision_client] get_face_embeddings_simple error: {e}") return []