Spaces:

GeradeHouse
/

Wan2.1-FLF2V

Paused

App Files Community

GeradeHouse committed on Apr 25

Commit

47b7da6

verified ·

1 Parent(s): f40229f

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -24

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 First-Last-Frame-to-Video (FLF2V)
-Author: GeradeHouse
 """
 import numpy as np
@@ -16,46 +17,44 @@ import torchvision.transforms.functional as TF
 # ---------------------------------------------------------------------
 # CONFIG ----------------------------------------------------------------
 MODEL_ID       = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"  # or switch to 1.3B
-DTYPE          = torch.float16                            # or bfloat16
 MAX_AREA       = 1280 * 720                                # ≤720p
-DEFAULT_FRAMES = 81                                        # ~5s @16 fps
 # ----------------------------------------------------------------------
 def load_pipeline():
-    """Lazy‐load & configure the pipeline once per process."""
-    # 1) load the CLIP image encoder (full-precision)
     image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    # 2) load the VAE (half-precision)
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
-    # 3) load the video pipeline
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
         image_encoder=image_encoder,
         torch_dtype=DTYPE,
     )
-    # 4) override the processor with the fast Rust implementation
     pipe.image_processor = CLIPImageProcessor.from_pretrained(
         MODEL_ID, subfolder="image_processor", use_fast=True
     )
-    # 5) memory helpers (offload UNet to CPU as needed)
-    # pipe.enable_model_cpu_offload()
-    # (Removed pipe.vae.enable_slicing() — not supported on AutoencoderKLWan)
-    return pipe.to("cuda" if torch.cuda.is_available() else "cpu")
 PIPE = load_pipeline()
 # ----------------------------------------------------------------------
 # UTILS ----------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
-    """Resize while keeping aspect & respecting patch multiples."""
     ar = img.height / img.width
     mod = PIPE.vae_scale_factor_spatial * PIPE.transformer.config.patch_size[1]
     h = round(np.sqrt(max_area * ar)) // mod * mod
@@ -63,11 +62,10 @@ def aspect_resize(img: Image.Image, max_area=MAX_AREA):
     return img.resize((w, h), Image.LANCZOS), h, w
 def center_crop_resize(img: Image.Image, h, w):
-    """Center‐crop & resize to H×W."""
     ratio = max(w / img.width, h / img.height)
     img = img.resize(
-        (round(img.width * ratio), round(img.height * ratio)),
-        Image.LANCZOS
     )
     return TF.center_crop(img, [h, w])
@@ -76,7 +74,7 @@ def center_crop_resize(img: Image.Image, h, w):
 def generate(first_frame, last_frame, prompt, negative_prompt, steps,
              guidance, num_frames, seed, fps):
-    # seed handling
     if seed == -1:
         seed = torch.seed()
     gen = torch.Generator(device=PIPE.device).manual_seed(seed)
@@ -86,7 +84,7 @@ def generate(first_frame, last_frame, prompt, negative_prompt, steps,
     if last_frame.size != first_frame.size:
         last_frame = center_crop_resize(last_frame, h, w)
-    # run the pipeline
     output = PIPE(
         image=first_frame,
         last_image=last_frame,
@@ -99,9 +97,9 @@ def generate(first_frame, last_frame, prompt, negative_prompt, steps,
         guidance_scale=guidance,
         generator=gen,
     )
-    frames = output.frames[0]  # list of PIL Image
-    # export to MP4
     video_path = export_to_video(frames, fps=fps)
     return video_path, seed
@@ -112,10 +110,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
-        last_img  = gr.Image(label="Last frame",  type="pil")
-    prompt         = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
-    negative       = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
     with gr.Accordion("Advanced parameters", open=False):
         steps      = gr.Slider(10, 50, value=30, step=1, label="Sampling steps")

 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 First-Last-Frame-to-Video (FLF2V)
+Uses Accelerate’s automatic device mapping for optimal CPU/GPU placement.
+Author: <your-handle>
 """
 import numpy as np
 # ---------------------------------------------------------------------
 # CONFIG ----------------------------------------------------------------
 MODEL_ID       = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"  # or switch to 1.3B
+DTYPE          = torch.float16                            # or torch.bfloat16
 MAX_AREA       = 1280 * 720                                # ≤720p
+DEFAULT_FRAMES = 81                                        # ~5s @16fps
 # ----------------------------------------------------------------------
 def load_pipeline():
+    """Load & auto-map the pipeline across CPU/GPU with low CPU memory usage."""
+    # 1) load vision encoder (full precision)
     image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
+    # 2) load VAE (half precision)
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
+    # 3) load the video pipeline with Accelerate helpers
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
         image_encoder=image_encoder,
         torch_dtype=DTYPE,
+        low_cpu_mem_usage=True,   # lazy-load weights into CPU RAM
+        device_map="auto",        # auto-split across CPU/GPU
     )
+    # 4) use the fast Rust-backed processor
     pipe.image_processor = CLIPImageProcessor.from_pretrained(
         MODEL_ID, subfolder="image_processor", use_fast=True
     )
+    return pipe
 PIPE = load_pipeline()
 # ----------------------------------------------------------------------
 # UTILS ----------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
+    """Resize while keeping aspect and patch-size multiples."""
     ar = img.height / img.width
     mod = PIPE.vae_scale_factor_spatial * PIPE.transformer.config.patch_size[1]
     h = round(np.sqrt(max_area * ar)) // mod * mod
     return img.resize((w, h), Image.LANCZOS), h, w
 def center_crop_resize(img: Image.Image, h, w):
+    """Center-crop & resize to target H×W."""
     ratio = max(w / img.width, h / img.height)
     img = img.resize(
+        (round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS
     )
     return TF.center_crop(img, [h, w])
 def generate(first_frame, last_frame, prompt, negative_prompt, steps,
              guidance, num_frames, seed, fps):
+    # handle seed
     if seed == -1:
         seed = torch.seed()
     gen = torch.Generator(device=PIPE.device).manual_seed(seed)
     if last_frame.size != first_frame.size:
         last_frame = center_crop_resize(last_frame, h, w)
+    # inference
     output = PIPE(
         image=first_frame,
         last_image=last_frame,
         guidance_scale=guidance,
         generator=gen,
     )
+    frames = output.frames[0]  # list[PIL.Image]
+    # export to mp4
     video_path = export_to_video(frames, fps=fps)
     return video_path, seed
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
+        last_img  = gr.Image(label="Last frame", type="pil")
+    prompt   = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
+    negative = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
     with gr.Accordion("Advanced parameters", open=False):
         steps      = gr.Slider(10, 50, value=30, step=1, label="Sampling steps")