VeuReu committed on
Commit
391b4d9
·
verified ·
1 Parent(s): b5d35bf

Update svision_client.py

Browse files
Files changed (1) hide show
  1. svision_client.py +249 -249
svision_client.py CHANGED
@@ -1,249 +1,249 @@
1
- import os
2
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3
-
4
- from gradio_client import Client, handle_file
5
- from typing import Any, Dict, List, Optional, Tuple, Union
6
- import requests
7
- import json
8
-
9
# The client is created on first use rather than at import time, so importing
# this module does not crash when the Space is down.
_svision_client = None


def _get_svision_client():
    """Return the module-wide svision client, building it on the first call."""
    global _svision_client
    if _svision_client is not None:
        return _svision_client
    _svision_client = Client("VeuReu/svision")
    return _svision_client
19
-
20
-
21
def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Run scene extraction on a video through the remote Space VeuReu/svision.

    The video and the three tuning values are forwarded as keyword arguments
    to the /scenes_extraction endpoint; its response is handed back untouched.

    Parameters
    ----------
    video_path : str
        Location of the video file to send.
    threshold : float, optional
        Scene change detection threshold; larger values make detection less
        sensitive.
    offset_frames : int, optional
        How many frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Border-cropping ratio applied before scene detection.

    Returns
    -------
    Any
        Whatever the remote /scenes_extraction endpoint returns.
    """
    client = _get_svision_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction",
    )
49
-
50
-
51
def keyframes_every_second_extraction(video_path: str):
    """
    Request keyframe extraction for a video from the remote Space VeuReu/svision.

    The video is wrapped as ``{"video": <file>}`` and sent to the
    /keyframes_every_second_extraction endpoint.

    Parameters
    ----------
    video_path : str
        Location of the video file to send.

    Returns
    -------
    Any
        Whatever the remote /keyframes_every_second_extraction endpoint returns.
    """
    client = _get_svision_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_path=video_payload,
        api_name="/keyframes_every_second_extraction",
    )
70
-
71
-
72
def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Send an image plus its metadata to the /add_ocr_and_faces endpoint.

    Both ``informacion_image`` and ``face_col`` are serialized to JSON strings
    before being sent to the remote Space VeuReu/svision together with the
    image file.

    Parameters
    ----------
    imagen_path : str
        Location of the image file to send.
    informacion_image : Dict[str, Any]
        Image-related metadata; must be JSON-serializable.
    face_col : List[Dict[str, Any]]
        Face collection entries (detected faces or face metadata); must be
        JSON-serializable.

    Returns
    -------
    Dict[str, Any]
        The response of the remote /add_ocr_and_faces endpoint, returned
        unchanged.
    """
    # Serialize first, so a non-serializable argument fails before any remote work.
    metadata_json = json.dumps(informacion_image)
    faces_json = json.dumps(face_col)

    client = _get_svision_client()
    return client.predict(
        image=handle_file(imagen_path),
        informacion_image=metadata_json,
        face_col=faces_json,
        api_name="/add_ocr_and_faces",
    )
102
-
103
-
104
def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Ask the remote Space VeuReu/svision to describe a single image.

    The image is sent to the /describe_images endpoint as a one-element list
    of ``{"image": <file>}`` entries.

    Parameters
    ----------
    imagen_path : str
        Location of the image file to send.

    Returns
    -------
    str
        The response of the /describe_images endpoint, returned unchanged.
    """
    client = _get_svision_client()
    image_batch = [{"image": handle_file(imagen_path)}]
    return client.predict(images=image_batch, api_name="/describe_images")
125
-
126
-
127
def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Resolve one Gradio file/gallery item to a path or URL string.

    Gradio Gallery returns different formats depending on version:
    - tuples or lists: (path, caption) / [path, caption]
    - dicts: {"name": path, "data": None, "is_file": True},
      {"path": ..., "url": ...} or {"image": <path or nested dict>, ...}
    - FileData-like objects exposing ``path`` / ``url`` / ``name``
    - plain path strings

    Parameters
    ----------
    file_obj : Any
        A single gallery item in any of the formats above.

    Returns
    -------
    Optional[str]
        The first usable path/URL found, or None when nothing can be extracted.
    """
    if file_obj is None:
        return None

    # A plain path or URL is already the answer.
    if isinstance(file_obj, str):
        return file_obj

    # (path, caption) pairs. A tuple becomes a list after a JSON round-trip,
    # so both are accepted; without this a list would fall through to the
    # str() fallback and yield a bogus "['...', '...']" path.
    # NOTE(review): assumes a list here is one (path, caption) item, not a
    # whole gallery - confirm against callers.
    if isinstance(file_obj, (tuple, list)):
        return _extract_path_from_gradio_file(file_obj[0]) if file_obj else None

    # Dict formats. Values are resolved recursively so that a nested entry
    # such as {"image": {"path": ...}} yields a string rather than a dict.
    if isinstance(file_obj, dict):
        for key in ("path", "url", "name", "image"):
            value = file_obj.get(key)
            if value:
                resolved = _extract_path_from_gradio_file(value)
                if resolved:
                    return resolved
        return None

    # FileData or similar object with attributes, in the same priority order.
    for attr in ("path", "url", "name"):
        value = getattr(file_obj, attr, None)
        if value:
            return value

    # Last resort: convert to string.
    return str(file_obj) if file_obj else None
161
-
162
-
163
def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Detect faces in an image via the /face_image_embedding_casting endpoint.

    The endpoint response is treated as a pair ``(face crops, embedding
    entries)``. One output dict is built per embedding entry and paired with
    the crop at the same position when one exists. Diagnostics are written
    with ``print``; any exception is reported and turned into an empty list.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        Dicts with the keys 'embedding', 'face_crop_path' and 'index'.
        Empty when the response is falsy, has fewer than two parts, or an
        error occurred.
    """
    try:
        response = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting",
        )

        response_len = len(response) if response else 0
        print(f"[svision_client] Raw result type: {type(response)}, len: {response_len}")

        # Anything that is not a (crops, embeddings) pair means "no faces".
        if not (response and len(response) >= 2):
            return []

        crops = response[0] or []
        embedding_entries = response[1] or []

        crops_len = len(crops) if isinstance(crops, list) else 'N/A'
        print(f"[svision_client] face_crops_raw type: {type(crops)}, len: {crops_len}")
        if crops and len(crops) > 0:
            print(f"[svision_client] First crop type: {type(crops[0])}, value: {str(crops[0])[:200]}")

        faces = []
        for position, entry in enumerate(embedding_entries):
            # The crop may be a dict, string or other Gradio file object.
            crop_path = None
            if position < len(crops):
                crop = crops[position]
                crop_path = _extract_path_from_gradio_file(crop)
                if not crop_path:
                    print(f"[svision_client] Could not extract path from crop {position}: {type(crop)} = {str(crop)[:100]}")

            # Non-dict entries contribute an empty embedding and their position.
            has_fields = isinstance(entry, dict)
            faces.append({
                "embedding": entry.get("embedding", []) if has_fields else [],
                "face_crop_path": crop_path,
                "index": entry.get("index", position) if has_fields else position,
            })

        print(f"[svision_client] Detected {len(faces)} faces from image")
        return faces
    except Exception as err:
        print(f"[svision_client] get_face_embeddings_from_image error: {err}")
        import traceback
        traceback.print_exc()
        return []
225
-
226
-
227
def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Fetch face embeddings only, via the /face_image_embedding endpoint.

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        The endpoint response (one embedding vector per detected face), or an
        empty list when the response is falsy or the call raised.
    """
    try:
        client = _get_svision_client()
        vectors = client.predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding",
        )
        return vectors or []
    except Exception as err:
        print(f"[svision_client] get_face_embeddings_simple error: {err}")
        return []
 
1
+ import os
2
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3
+
4
+ from gradio_client import Client, handle_file
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+ import requests
7
+ import json
8
+
9
# The client is created on first use rather than at import time, so importing
# this module does not crash when the Space is down.
_svision_client = None


def _get_svision_client():
    """Return the module-wide svision client, building it on the first call."""
    global _svision_client
    if _svision_client is not None:
        return _svision_client
    _svision_client = Client("VeuReu/svision")
    return _svision_client
19
+
20
+
21
def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Run scene extraction on a video through the remote Space VeuReu/svision.

    The video and the three tuning values are forwarded as keyword arguments
    to the /scenes_extraction endpoint; its response is handed back untouched.

    Parameters
    ----------
    video_path : str
        Location of the video file to send.
    threshold : float, optional
        Scene change detection threshold; larger values make detection less
        sensitive.
    offset_frames : int, optional
        How many frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Border-cropping ratio applied before scene detection.

    Returns
    -------
    Any
        Whatever the remote /scenes_extraction endpoint returns.
    """
    client = _get_svision_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction",
    )
49
+
50
+
51
def keyframes_every_second_extraction(video_path: str):
    """
    Request keyframe extraction for a video from the remote Space VeuReu/svision.

    The video is wrapped as ``{"video": <file>}`` and sent to the
    /keyframes_every_second_extraction endpoint.

    Parameters
    ----------
    video_path : str
        Location of the video file to send.

    Returns
    -------
    Any
        Whatever the remote /keyframes_every_second_extraction endpoint returns.
    """
    client = _get_svision_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_path=video_payload,
        api_name="/keyframes_every_second_extraction",
    )
70
+
71
+
72
def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Send an image plus its metadata to the /add_ocr_and_faces endpoint.

    Both ``informacion_image`` and ``face_col`` are serialized to JSON strings
    before being sent to the remote Space VeuReu/svision together with the
    image file.

    Parameters
    ----------
    imagen_path : str
        Location of the image file to send.
    informacion_image : Dict[str, Any]
        Image-related metadata; must be JSON-serializable.
    face_col : List[Dict[str, Any]]
        Face collection entries (detected faces or face metadata); must be
        JSON-serializable.

    Returns
    -------
    Dict[str, Any]
        The response of the remote /add_ocr_and_faces endpoint, returned
        unchanged.
    """
    # Serialize first, so a non-serializable argument fails before any remote work.
    metadata_json = json.dumps(informacion_image)
    faces_json = json.dumps(face_col)

    client = _get_svision_client()
    return client.predict(
        image=handle_file(imagen_path),
        informacion_image=metadata_json,
        face_col=faces_json,
        api_name="/add_ocr_and_faces",
    )
102
+
103
+
104
def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Ask the remote Space VeuReu/svision to describe a single image.

    The image is sent to the /describe_images endpoint as a one-element list
    of ``{"image": <file>}`` entries.

    Parameters
    ----------
    imagen_path : str
        Location of the image file to send.

    Returns
    -------
    str
        The response of the /describe_images endpoint, returned unchanged.
    """
    client = _get_svision_client()
    image_batch = [{"image": handle_file(imagen_path)}]
    return client.predict(images=image_batch, api_name="/describe_images")
125
+
126
+
127
def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Resolve one Gradio file/gallery item to a path or URL string.

    Gradio Gallery returns different formats depending on version:
    - tuples or lists: (path, caption) / [path, caption]
    - dicts: {"name": path, "data": None, "is_file": True},
      {"path": ..., "url": ...} or {"image": <path or nested dict>, ...}
    - FileData-like objects exposing ``path`` / ``url`` / ``name``
    - plain path strings

    Parameters
    ----------
    file_obj : Any
        A single gallery item in any of the formats above.

    Returns
    -------
    Optional[str]
        The first usable path/URL found, or None when nothing can be extracted.
    """
    if file_obj is None:
        return None

    # A plain path or URL is already the answer.
    if isinstance(file_obj, str):
        return file_obj

    # (path, caption) pairs. A tuple becomes a list after a JSON round-trip,
    # so both are accepted; without this a list would fall through to the
    # str() fallback and yield a bogus "['...', '...']" path.
    # NOTE(review): assumes a list here is one (path, caption) item, not a
    # whole gallery - confirm against callers.
    if isinstance(file_obj, (tuple, list)):
        return _extract_path_from_gradio_file(file_obj[0]) if file_obj else None

    # Dict formats. Values are resolved recursively so that a nested entry
    # such as {"image": {"path": ...}} yields a string rather than a dict.
    if isinstance(file_obj, dict):
        for key in ("path", "url", "name", "image"):
            value = file_obj.get(key)
            if value:
                resolved = _extract_path_from_gradio_file(value)
                if resolved:
                    return resolved
        return None

    # FileData or similar object with attributes, in the same priority order.
    for attr in ("path", "url", "name"):
        value = getattr(file_obj, attr, None)
        if value:
            return value

    # Last resort: convert to string.
    return str(file_obj) if file_obj else None
161
+
162
+
163
def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Detect faces in an image via the /face_image_embedding_casting endpoint.

    The endpoint response is treated as a pair ``(face crops, embedding
    entries)``. One output dict is built per embedding entry and paired with
    the crop at the same position when one exists. Diagnostics are written
    with ``print``; any exception is reported and turned into an empty list.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        Dicts with the keys 'embedding', 'face_crop_path' and 'index'.
        Empty when the response is falsy, has fewer than two parts, or an
        error occurred.
    """
    try:
        response = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting",
        )

        response_len = len(response) if response else 0
        print(f"[svision_client] Raw result type: {type(response)}, len: {response_len}")

        # Anything that is not a (crops, embeddings) pair means "no faces".
        if not (response and len(response) >= 2):
            return []

        crops = response[0] or []
        embedding_entries = response[1] or []

        crops_len = len(crops) if isinstance(crops, list) else 'N/A'
        print(f"[svision_client] face_crops_raw type: {type(crops)}, len: {crops_len}")
        if crops and len(crops) > 0:
            print(f"[svision_client] First crop type: {type(crops[0])}, value: {str(crops[0])[:200]}")

        faces = []
        for position, entry in enumerate(embedding_entries):
            # The crop may be a dict, string or other Gradio file object.
            crop_path = None
            if position < len(crops):
                crop = crops[position]
                crop_path = _extract_path_from_gradio_file(crop)
                if not crop_path:
                    print(f"[svision_client] Could not extract path from crop {position}: {type(crop)} = {str(crop)[:100]}")

            # Non-dict entries contribute an empty embedding and their position.
            has_fields = isinstance(entry, dict)
            faces.append({
                "embedding": entry.get("embedding", []) if has_fields else [],
                "face_crop_path": crop_path,
                "index": entry.get("index", position) if has_fields else position,
            })

        print(f"[svision_client] Detected {len(faces)} faces from image")
        return faces
    except Exception as err:
        print(f"[svision_client] get_face_embeddings_from_image error: {err}")
        import traceback
        traceback.print_exc()
        return []
225
+
226
+
227
def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Fetch face embeddings only, via the /face_image_embedding endpoint.

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        The endpoint response (one embedding vector per detected face), or an
        empty list when the response is falsy or the call raised.
    """
    try:
        client = _get_svision_client()
        vectors = client.predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding",
        )
        return vectors or []
    except Exception as err:
        print(f"[svision_client] get_face_embeddings_simple error: {err}")
        return []