--- a/depth_anything_3/app/modules/model_inference.py
+++ b/depth_anything_3/app/modules/model_inference.py
@@ -31,47 +31,67 @@
 from depth_anything_3.utils.export.glb import export_to_glb
 from depth_anything_3.utils.export.gs import export_to_gs_video
 
+# Global cache for model (used in GPU subprocess)
+# This is safe because @spaces.GPU runs in isolated subprocess
+_MODEL_CACHE = None
+
+
 class ModelInference:
     """
     Handles model inference and data processing for Depth Anything 3.
     """
 
     def __init__(self):
-        """Initialize the model inference handler."""
-        self.model = None
-
-    def initialize_model(self, device: str = "cuda") -> None:
+        """Initialize the model inference handler.
+
+        Note: Do NOT store model in instance variable to avoid
+        state sharing issues with @spaces.GPU decorator.
+        """
+        pass  # No instance variables
+
+    def initialize_model(self, device: str = "cuda"):
         """
         Initialize the DepthAnything3 model.
+
+        Uses global cache to store model safely in GPU subprocess.
+        This avoids CUDA initialization in main process.
 
         Args:
             device: Device to load the model on
+
+        Returns:
+            Model instance
         """
-        if self.model is None:
+        global _MODEL_CACHE
+
+        if _MODEL_CACHE is None:
             # Get model directory from environment variable or use default
             model_dir = os.environ.get(
                 "DA3_MODEL_DIR", "/dev/shm/da3_models/DA3HF-VITG-METRIC_VITL"
            )
-            self.model = DepthAnything3.from_pretrained(model_dir)
-            self.model = self.model.to(device)
+            print(f"Loading model from {model_dir}...")
+            _MODEL_CACHE = DepthAnything3.from_pretrained(model_dir)
+            _MODEL_CACHE = _MODEL_CACHE.to(device)
+            _MODEL_CACHE.eval()
+            print("Model loaded and moved to GPU")
         else:
-            self.model = self.model.to(device)
-
-        self.model.eval()
+            print("Using cached model")
+            # Ensure model is on correct device
+            _MODEL_CACHE = _MODEL_CACHE.to(device)
+
+        return _MODEL_CACHE
 
     def run_inference(
         self,
...
         # Initialize model if needed
-        self.initialize_model(device)
+        model = self.initialize_model(device)
...
         # Run model inference
         print(f"Running inference with method: {actual_method}")
         with torch.no_grad():
-            prediction = self.model.inference(
+            prediction = model.inference(
                 image_paths, export_dir=None, process_res_method=actual_method, infer_gs=infer_gs
             )
@@ -192,6 +212,10 @@ class ModelInference:
         # Process results
         processed_data = self._process_results(target_dir, prediction, image_paths)
 
+        # CRITICAL: Move all CUDA tensors to CPU before returning
+        # This prevents CUDA initialization in main process during unpickling
+        prediction = self._move_prediction_to_cpu(prediction)
+
         # Clean up
         torch.cuda.empty_cache()
@@ -282,6 +306,45 @@ class ModelInference:
 
         return processed_data
 
+    def _move_prediction_to_cpu(self, prediction: Any) -> Any:
+        """
+        Move all CUDA tensors in prediction to CPU for safe pickling.
+
+        This is REQUIRED for HF Spaces with @spaces.GPU decorator to avoid
+        CUDA initialization in the main process during unpickling.
+
+        Args:
+            prediction: Prediction object that may contain CUDA tensors
+
+        Returns:
+            Prediction object with all tensors moved to CPU
+        """
+        # Move gaussians tensors to CPU
+        if hasattr(prediction, 'gaussians') and prediction.gaussians is not None:
+            gaussians = prediction.gaussians
+
+            # Move each tensor attribute to CPU
+            tensor_attrs = ['means', 'scales', 'rotations', 'harmonics', 'opacities']
+            for attr in tensor_attrs:
+                if hasattr(gaussians, attr):
+                    tensor = getattr(gaussians, attr)
+                    if isinstance(tensor, torch.Tensor) and tensor.is_cuda:
+                        setattr(gaussians, attr, tensor.cpu())
+                        print(f"Moved gaussians.{attr} to CPU")
+
+        # Move any tensors in aux dict to CPU
+        if hasattr(prediction, 'aux') and prediction.aux is not None:
+            for key, value in list(prediction.aux.items()):
+                if isinstance(value, torch.Tensor) and value.is_cuda:
+                    prediction.aux[key] = value.cpu()
+                    print(f"Moved aux['{key}'] to CPU")
+                elif isinstance(value, dict):
+                    # Recursively handle nested dicts
+                    for k, v in list(value.items()):
+                        if isinstance(v, torch.Tensor) and v.is_cuda:
+                            value[k] = v.cpu()
+                            print(f"Moved aux['{key}']['{k}'] to CPU")
+
+        return prediction
+
     def cleanup(self) -> None:
         """Clean up GPU memory."""
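
For context, a minimal usage sketch of how a Spaces GPU handler would be expected to consume this class after the change: the model is created inside the `@spaces.GPU` subprocess via the module-level cache, and the returned prediction is already CPU-only, so unpickling it back in the main Gradio process never initializes CUDA. The handler name, its arguments, and the `run_inference` call signature below are illustrative assumptions; they are not part of this diff.

```python
# Hypothetical wiring sketch, assuming HF Spaces ZeroGPU (the `spaces` package)
# and a Gradio app; names and arguments are illustrative only.
import spaces

from depth_anything_3.app.modules.model_inference import ModelInference

# Holds no state, so it is safe to construct in the main process.
inference = ModelInference()


@spaces.GPU  # body runs in an isolated GPU subprocess
def run(target_dir, image_paths):
    # Model loading and all CUDA work happen here, inside the subprocess.
    # The return value is pickled back to the main process, which is why
    # _move_prediction_to_cpu() must strip CUDA tensors before returning.
    return inference.run_inference(target_dir, image_paths)
```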