Spaces: Running on Zero

update aligned, fix z-score

app.py CHANGED
@@ -1,7 +1,10 @@
 # Author: Huzheng Yang
 # %%
 import copy
+from io import BytesIO
 import os
+
+from matplotlib import pyplot as plt
 USE_HUGGINGFACE_ZEROGPU = os.getenv("USE_HUGGINGFACE_ZEROGPU", "False").lower() in ["true", "1", "yes"]
 DOWNLOAD_ALL_MODELS_DATASETS = os.getenv("DOWNLOAD_ALL_MODELS_DATASETS", "False").lower() in ["true", "1", "yes"]
 
@@ -241,7 +244,7 @@ def ncut_run(
     logging_str = ""
     if "AlignedThreeModelAttnNodes" == model_name:
         # dirty patch for the alignedcut paper
-        resolution = (
+        resolution = (224, 224)
     else:
         resolution = RES_DICT[model_name]
     logging_str += f"Resolution: {resolution}\n"
@@ -357,11 +360,18 @@ def ncut_run(
 
     if "AlignedThreeModelAttnNodes" == model_name:
         # dirty patch for the alignedcut paper
-        galleries = []
-        for i_node in range(rgb.shape[1]):
-            _rgb = rgb[:, i_node]
-            galleries.append(to_pil_images(_rgb, target_size=56))
-        return *galleries, logging_str
+        # galleries = []
+        # for i_node in range(rgb.shape[1]):
+        #     _rgb = rgb[:, i_node]
+        #     galleries.append(to_pil_images(_rgb, target_size=56))
+        # return *galleries, logging_str
+        pil_images = []
+        for i_image in range(rgb.shape[0]):
+            _im = plot_one_image_36_grid(images[i_image], rgb[i_image])
+            pil_images.append(_im)
+        return pil_images, logging_str
+
+
 
     if is_lisa == True:
         # dirty patch for the LISA model
@@ -457,9 +467,78 @@ def transform_image(image, resolution=(1024, 1024)):
     image = torch.tensor(np.array(image).transpose(2, 0, 1)).float()
     image = image / 255
     # Normalize
-
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+    image = (image - torch.tensor(mean).view(3, 1, 1)) / torch.tensor(std).view(3, 1, 1)
     return image
 
+def plot_one_image_36_grid(original_image, tsne_rgb_images):
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+    original_image = original_image * torch.tensor(std).view(3, 1, 1) + torch.tensor(mean).view(3, 1, 1)
+    original_image = torch.clamp(original_image, 0, 1)
+
+    fig = plt.figure(figsize=(20, 4))
+    grid = plt.GridSpec(3, 14, hspace=0.1, wspace=0.1)
+
+    ax1 = fig.add_subplot(grid[0:2, 0:2])
+    img = original_image.cpu().float().numpy().transpose(1, 2, 0)
+
+    def convert_and_pad_image(np_array, pad_size=20):
+        """
+        Converts a NumPy array of shape (height, width, 3) to a PNG image
+        and pads the right and bottom sides with a transparent background.
+
+        Args:
+            np_array (numpy.ndarray): Input NumPy array of shape (height, width, 3)
+            pad_size (int, optional): Number of pixels to pad on the right and bottom sides. Default is 20.
+
+        Returns:
+            PIL.Image: Padded PNG image with transparent background
+        """
+        # Convert NumPy array to PIL Image
+        img = Image.fromarray(np_array)
+
+        # Get the original size
+        width, height = img.size
+
+        # Create a new image with padding and transparent background
+        new_width = width + pad_size
+        new_height = height + pad_size
+        padded_img = Image.new('RGBA', (new_width, new_height), color=(255, 255, 255, 0))
+
+        # Paste the original image onto the padded image
+        padded_img.paste(img, (0, 0))
+
+        return padded_img
+
+    img = convert_and_pad_image((img*255).astype(np.uint8))
+    ax1.imshow(img)
+    ax1.axis('off')
+
+    model_names = ['CLIP', 'DINO', 'MAE']
+
+    for i_model, model_name in enumerate(model_names):
+        for i_layer in range(12):
+            ax = fig.add_subplot(grid[i_model, i_layer+2])
+            ax.imshow(tsne_rgb_images[i_layer+12*i_model].cpu().float().numpy())
+            ax.axis('off')
+            if i_model == 0:
+                ax.set_title(f'Layer{i_layer}', fontsize=16)
+            if i_layer == 0:
+                ax.text(-0.1, 0.5, model_name, va="center", ha="center", fontsize=16, transform=ax.transAxes, rotation=90,)
+    plt.tight_layout()
+    buf = BytesIO()
+    plt.savefig(buf, bbox_inches='tight', pad_inches=0, dpi=100)
+
+    buf.seek(0)  # Move to the start of the BytesIO buffer
+    img = Image.open(buf)
+    img = img.convert("RGB")
+    img = copy.deepcopy(img)
+    buf.close()
+    plt.close()
+    return img
+
 def load_alignedthreemodel():
 
     os.system("git clone https://huggingface.co/huzey/alignedthreeattn >> /dev/null 2>&1")
@@ -687,10 +766,10 @@ def make_input_video_section():
     clear_images_button = gr.Button("🗑️Clear", elem_id='clear_button', variant='stop')
     return input_gallery, submit_button, clear_images_button, max_frames_number
 
-def make_dataset_images_section(advanced=False):
+def make_dataset_images_section(advanced=False, is_random=False):
 
     gr.Markdown('### Load Datasets')
-    load_images_button = gr.Button("Load", elem_id="load-images-button", variant='
+    load_images_button = gr.Button("🟢 Load Images", elem_id="load-images-button", variant='primary')
     advanced_radio = gr.Radio(["Basic", "Advanced"], label="Datasets", value="Advanced" if advanced else "Basic", elem_id="advanced-radio")
     with gr.Column() as basic_block:
         example_gallery = gr.Gallery(value=example_items, label="Example Set A", show_label=False, columns=[3], rows=[2], object_fit="scale-down", height="200px", show_share_button=False, elem_id="example-gallery")
@@ -700,10 +779,17 @@ def make_dataset_images_section(advanced=False):
         with gr.Row():
             dataset_dropdown = gr.Dropdown(dataset_names, label="Dataset name", value="mrm8488/ImageNet1K-val", elem_id="dataset", min_width=300)
             num_images_slider = gr.Number(10, label="Number of images", elem_id="num_images")
-
-
-
-
+            if not is_random:
+                filter_by_class_checkbox = gr.Checkbox(label="Filter by class", value=True, elem_id="filter_by_class_checkbox")
+                filter_by_class_text = gr.Textbox(label="Class to select", value="0,33,99", elem_id="filter_by_class_text", info=f"e.g. `0,1,2`. (1000 classes)", visible=True)
+                is_random_checkbox = gr.Checkbox(label="Random shuffle", value=False, elem_id="random_seed_checkbox")
+                random_seed_slider = gr.Slider(0, 1000, step=1, label="Random seed", value=1, elem_id="random_seed", visible=False)
+            if is_random:
+                filter_by_class_checkbox = gr.Checkbox(label="Filter by class", value=False, elem_id="filter_by_class_checkbox")
+                filter_by_class_text = gr.Textbox(label="Class to select", value="0,33,99", elem_id="filter_by_class_text", info=f"e.g. `0,1,2`. (1000 classes)", visible=False)
+                is_random_checkbox = gr.Checkbox(label="Random shuffle", value=True, elem_id="random_seed_checkbox")
+                random_seed_slider = gr.Slider(0, 1000, step=1, label="Random seed", value=42, elem_id="random_seed", visible=True)
+
 
     if advanced:
         advanced_block.visible = True
@@ -1168,12 +1254,18 @@ with demo:
         with gr.Column(scale=5, min_width=200):
             input_gallery, submit_button, clear_images_button = make_input_images_section()
 
-            dataset_dropdown, num_images_slider, random_seed_slider, load_images_button = make_dataset_images_section(advanced=True)
+            dataset_dropdown, num_images_slider, random_seed_slider, load_images_button = make_dataset_images_section(advanced=True, is_random=True)
            num_images_slider.value = 100
 
+
        with gr.Column(scale=5, min_width=200):
+            output_gallery = make_output_images_section()
+            gr.Markdown('### TIP1: use the `full-screen` button, and use `arrow keys` to navigate')
+            gr.Markdown('---')
            gr.Markdown('Model: CLIP(ViT-B-16/openai), DiNOv2reg(dinov2_vitb14_reg), MAE(vit_base)')
            gr.Markdown('Layer type: attention output (attn), without sum of residual')
+            gr.Markdown('### TIP2: for large image set, please increase the `num_sample` for t-SNE and NCUT')
+            gr.Markdown('---')
        [
            model_dropdown, layer_slider, node_type_dropdown, num_eig_slider,
            affinity_focal_gamma_slider, num_sample_ncut_slider, knn_ncut_slider,
@@ -1185,20 +1277,23 @@ with demo:
        model_dropdown.visible = False
        layer_slider.visible = False
        node_type_dropdown.visible = False
+        num_sample_ncut_slider.value = 10000
+        num_sample_tsne_slider.value = 1000
        # logging text box
        logging_text = gr.Textbox("Logging information", label="Logging", elem_id="logging", type="text", placeholder="Logging information")
 
-        galleries = []
-        for i_model, model_name in enumerate(["CLIP", "DINO", "MAE"]):
-            with gr.Row():
-                for i_layer in range(1, 13):
-                    with gr.Column(scale=5, min_width=200):
-                        gr.Markdown(f'### {model_name} Layer {i_layer}')
-                        output_gallery = gr.Gallery(value=[], label="NCUT Embedding", show_label=False, elem_id="ncut", columns=[3], rows=[1], object_fit="contain", height="auto")
-                        galleries.append(output_gallery)
+        # galleries = []
+        # for i_model, model_name in enumerate(["CLIP", "DINO", "MAE"]):
+        #     with gr.Row():
+        #         for i_layer in range(1, 13):
+        #             with gr.Column(scale=5, min_width=200):
+        #                 gr.Markdown(f'### {model_name} Layer {i_layer}')
+        #                 output_gallery = gr.Gallery(value=[], label="NCUT Embedding", show_label=False, elem_id="ncut", columns=[3], rows=[1], object_fit="contain", height="auto")
+        #                 galleries.append(output_gallery)
 
 
-        clear_images_button.click(lambda x: [] * (len(galleries) + 1), outputs=[input_gallery] + galleries)
+        # clear_images_button.click(lambda x: [] * (len(galleries) + 1), outputs=[input_gallery] + galleries)
+        clear_images_button.click(lambda x: ([], []), outputs=[input_gallery, output_gallery])
 
        false_placeholder = gr.Checkbox(label="False", value=False, elem_id="false_placeholder", visible=False)
        no_prompt = gr.Textbox("", label="", elem_id="empty_placeholder", type="text", placeholder="", visible=False)
@@ -1213,7 +1308,8 @@ with demo:
                embedding_method_dropdown, num_sample_tsne_slider, knn_tsne_slider,
                perplexity_slider, n_neighbors_slider, min_dist_slider, sampling_method_dropdown
            ],
-            outputs=galleries + [logging_text],
+            # outputs=galleries + [logging_text],
+            outputs=[output_gallery, logging_text],
        )
 
    with gr.Tab('Compare Models'):
@@ -1320,4 +1416,4 @@ if DOWNLOAD_ALL_MODELS_DATASETS:
    demo.launch(share=True)
 
 
-# %%
+# %%
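
The "fix z-score" part of this commit standardizes inputs in transform_image with the ImageNet mean/std and undoes that standardization in plot_one_image_36_grid before plotting. For reference, the snippet below is a minimal standalone sketch of that normalize/denormalize pair; the helper names normalize and denormalize are illustrative and not functions in app.py, and only torch plus the mean/std values from the diff are assumed.

import torch

# ImageNet statistics used in the diff (transform_image / plot_one_image_36_grid)
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

def normalize(image):
    # image: float tensor of shape (3, H, W) with values in [0, 1], as produced by transform_image
    return (image - IMAGENET_MEAN) / IMAGENET_STD

def denormalize(image):
    # inverse of normalize(); clamp to [0, 1] before plotting, as plot_one_image_36_grid does
    return torch.clamp(image * IMAGENET_STD + IMAGENET_MEAN, 0, 1)

x = torch.rand(3, 224, 224)  # stand-in for a resized, rescaled input image
assert torch.allclose(denormalize(normalize(x)), x, atol=1e-6)

Round-tripping through these two steps recovers the original pixel values, which is the property the un-normalization in plot_one_image_36_grid relies on when it renders the original image next to the per-layer embeddings.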