File size: 8,638 Bytes
391b4d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import os
# Expose only CUDA device index 1 to this process. The assignment is placed
# before the remaining imports so the variable is already in the environment
# when those modules are loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import requests
import json

# Cached gradio Client for the "VeuReu/svision" Space. It stays None until
# _get_svision_client() is first called, so importing this module does not
# crash if the Space is down at import time.
_svision_client = None


def _get_svision_client():
    """Return the shared svision client, creating it on first use.

    The client is stored in the module-level ``_svision_client`` variable so
    that later calls reuse the same instance instead of reconnecting.
    """
    global _svision_client
    # Fast path: a client was already created by an earlier call.
    if _svision_client is not None:
        return _svision_client
    _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Run scene extraction on a video through the remote Space VeuReu/svision.

    The video is uploaded and the ``/scenes_extraction`` endpoint is invoked
    with the given detection settings.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Whatever the remote /scenes_extraction endpoint returns, unmodified.
    """
    client = _get_svision_client()
    # The endpoint receives the video wrapped in a dict under the "video" key.
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction",
    )


def keyframes_every_second_extraction(video_path: str):
    """
    Request keyframe extraction for a video from the remote Space VeuReu/svision.

    The video is uploaded and the ``/keyframes_every_second_extraction``
    endpoint is invoked with it.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Whatever the remote /keyframes_every_second_extraction endpoint
        returns, unmodified.
    """
    client = _get_svision_client()
    # The endpoint receives the video wrapped in a dict under the "video" key.
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_path=video_payload,
        api_name="/keyframes_every_second_extraction",
    )


def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Send an image plus its metadata to the /add_ocr_and_faces endpoint of VeuReu/svision.

    The metadata dictionary and the face collection are serialized to JSON
    strings before being sent alongside the uploaded image.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata; must be JSON-serializable.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata;
        must be JSON-serializable.

    Returns
    -------
    Dict[str, Any]
        Response of the remote endpoint, returned unmodified.
    """
    # Both structured arguments travel as JSON text; serialize them up front
    # so a serialization error surfaces before any remote work starts.
    metadata_json, faces_json = map(json.dumps, (informacion_image, face_col))
    client = _get_svision_client()
    return client.predict(
        image=handle_file(imagen_path),
        informacion_image=metadata_json,
        face_col=faces_json,
        api_name="/add_ocr_and_faces",
    )


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Ask the /describe_images endpoint of VeuReu/svision to describe one image.

    The image is sent as a single-element list, each element being a dict
    with the uploaded file under the "image" key.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Response of the remote endpoint, returned unmodified.
    """
    client = _get_svision_client()
    image_batch = [{"image": handle_file(imagen_path)}]
    return client.predict(
        images=image_batch,
        api_name="/describe_images",
    )


def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Extract a file path (or URL) from a Gradio file object.

    Gradio Gallery returns different formats depending on version:
    - List of tuples: [(path, caption), ...]
    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
    - List of FileData: [FileData(path=..., url=...), ...]
    - List of paths: [path, ...]

    This function handles ONE item of such a list. Compared with a plain
    key lookup it additionally copes with two shapes:
    - a ``[path, caption]`` list (presumably how a tuple arrives after a
      JSON round trip), which is treated like the tuple form;
    - a dict whose value is itself a file dict, e.g.
      ``{"image": {"path": ...}, "caption": ...}``, which is resolved
      recursively so the result is a string rather than a nested dict.

    Parameters
    ----------
    file_obj : Any
        A single gallery/file item: str, tuple/list, dict, or an object
        exposing ``path`` / ``url`` / ``name`` attributes.

    Returns
    -------
    Optional[str]
        The extracted path or URL, or None when nothing usable is found.
    """
    if file_obj is None:
        return None

    # Plain string path/URL: returned unchanged.
    if isinstance(file_obj, str):
        return file_obj

    # (path, caption) pair, as a tuple or a list: the file is the first item.
    if isinstance(file_obj, (tuple, list)):
        return _extract_path_from_gradio_file(file_obj[0]) if file_obj else None

    # Dict format: try the known keys in priority order. Values are resolved
    # recursively because a value may itself be a nested file dict.
    if isinstance(file_obj, dict):
        for key in ("path", "url", "name", "image"):
            candidate = file_obj.get(key)
            if candidate:
                resolved = _extract_path_from_gradio_file(candidate)
                if resolved:
                    return resolved
        return None

    # FileData or similar object: first non-empty attribute wins.
    for attr in ("path", "url", "name"):
        value = getattr(file_obj, attr, None)
        if value:
            return value

    # Last resort: string representation of any other truthy object.
    return str(file_obj) if file_obj else None


def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Detect faces in an image via the /face_image_embedding_casting endpoint.

    The remote call is expected to return a pair
    ``(face_crops, face_embeddings)``. Each embedding entry is paired by
    position with the crop at the same index, and the result is one dict per
    embedding entry. Diagnostic information is printed along the way.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        One dict per embedding entry with keys 'embedding' (list of floats),
        'face_crop_path' (path string, or None when no crop is available) and
        'index'. An empty list is returned when the response has fewer than
        two elements, is empty, or when any exception occurs.
    """
    try:
        response = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting"
        )

        response_len = len(response) if response else 0
        print(f"[svision_client] Raw result type: {type(response)}, len: {response_len}")

        # Need both halves of the (crops, embeddings) pair to proceed.
        if not response or len(response) < 2:
            return []

        crops = response[0] or []
        embedding_entries = response[1] or []

        crops_len = len(crops) if isinstance(crops, list) else 'N/A'
        print(f"[svision_client] face_crops_raw type: {type(crops)}, len: {crops_len}")
        if crops and len(crops) > 0:
            print(f"[svision_client] First crop type: {type(crops[0])}, value: {str(crops[0])[:200]}")

        faces = []
        for idx, entry in enumerate(embedding_entries):
            # There may be fewer crops than embeddings; leave the path as
            # None in that case.
            crop_path = None
            if idx < len(crops):
                crop_obj = crops[idx]
                crop_path = _extract_path_from_gradio_file(crop_obj)
                if not crop_path:
                    print(f"[svision_client] Could not extract path from crop {idx}: {type(crop_obj)} = {str(crop_obj)[:100]}")

            # Entries that are not dicts carry no usable data: fall back to
            # an empty embedding and the positional index.
            if isinstance(entry, dict):
                embedding = entry.get("embedding", [])
                face_index = entry.get("index", idx)
            else:
                embedding = []
                face_index = idx

            faces.append({
                "embedding": embedding,
                "face_crop_path": crop_path,
                "index": face_index,
            })

        print(f"[svision_client] Detected {len(faces)} faces from image")
        return faces
    except Exception as exc:
        # Best-effort contract: report the failure and return no faces
        # instead of propagating the error to the caller.
        print(f"[svision_client] get_face_embeddings_from_image error: {exc}")
        import traceback
        traceback.print_exc()
        return []


def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Fetch face embeddings for an image from the /face_image_embedding endpoint.

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        List of embedding vectors (one per detected face). An empty list is
        returned when the endpoint yields a falsy result or when any
        exception occurs (the error is printed, not raised).
    """
    try:
        embeddings = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding"
        )
        # Normalize None / empty responses to an empty list.
        return embeddings or []
    except Exception as exc:
        print(f"[svision_client] get_face_embeddings_simple error: {exc}")
        return []