Spaces:

sergio-sanz-rodriguez
/

transform-eats

Sleeping

App Files Files Community

sergio-sanz-rodriguez committed on Jan 6, 2025

Commit

f1dd2a9

1 Parent(s): aeaa073

updated app, with the new algorithm using two ViTs

Browse files

Files changed (3) hide show

app.py +110 -39
class_names.txt +2 -2
food_descriptions.json +2 -2

app.py CHANGED Viewed

@@ -13,10 +13,13 @@ from torchvision.transforms import v2
 # Specify class names
 food_vision_class_names_path = "class_names.txt"
 with open(food_vision_class_names_path, "r") as f:
-    class_names = f.read().splitlines()
 # Specify number of classes
-num_classes = len(class_names) - 1 # 101, "unknown" to be discarded
 # Load the food description file
 food_descriptions_json = "food_descriptions.json"
@@ -32,18 +35,26 @@ effnetb0_model = create_effnetb0(
     compile=True
     )
-# Load the ViT-Base/16 transformer with input image of 384x384 pixels
-vitbase_model = create_vitbase_model(
     model_weights_dir=".",
     model_weights_name="vitbase16_2_2024-12-31.pth",
     img_size=384,
-    num_classes=num_classes,
     compile=True
 )
 # Specify manual transforms for model_2
 transforms = v2.Compose([
-    v2.Resize(384), #v2.Resize((384, 384)),
     v2.CenterCrop((384, 384)),
     v2.ToImage(),
     v2.ToDtype(torch.float32, scale=True),
@@ -51,66 +62,126 @@ transforms = v2.Compose([
                 std=[0.229, 0.224, 0.225])
 ])
 # Put models into evaluation mode and turn on inference mode
 effnetb0_model.eval()
-vitbase_model.eval()
 # Set thresholds
 BINARY_CLASSIF_THR = 0.9989122152328491
 MULTICLASS_CLASSIF_THR = 0.5
 ENTROPY_THR = 2.6
-# Predict function
-def predict(image) -> Tuple[Dict, str, str]:
     """Transforms and performs a prediction on image and returns prediction and time taken.
     """
     try:
         # Start the timer
         start_time = timer()
         # Transform the target image and add a batch dimension
         image = transforms(image).unsqueeze(0)
         # Make prediction...
         with torch.inference_mode():
             # If the picture is food
             if effnetb0_model(image)[:,1].cpu() >= BINARY_CLASSIF_THR:
-                # Pass the transformed image through the model and turn the prediction logits into prediction probabilities
-                pred_probs = torch.softmax(vitbase_model(image), dim=1) # 101 classes
-                # Calculate entropy
-                entropy = -torch.sum(pred_probs * torch.log(pred_probs), dim=1).item()
-                # Create a prediction label and prediction probability dictionary for each prediction class
-                pred_classes_and_probs = {class_names[i]: float(pred_probs[0][i]) for i in range(num_classes)}
-                pred_classes_and_probs["unknown"] = 0.0
-                # Get the top predicted class
-                top_class = max(pred_classes_and_probs, key=pred_classes_and_probs.get)
-                # If the image is likely to be an unknown category
-                if pred_probs[0][class_names.index(top_class)] <= MULTICLASS_CLASSIF_THR and entropy > ENTROPY_THR:
-                    # Create prediction label and prediction probability for class unknown and rescale the rest of predictions
-                    pred_classes_and_probs["unknown"] = pred_probs.max() * 1.25
-                    prob_sum = sum(pred_classes_and_probs.values())
-                    pred_classes_and_probs = {key: value / prob_sum for key, value in pred_classes_and_probs.items()}
                     # Get the top predicted class
-                    top_class = "unknown"
             # Otherwise
             else:
                 # Set all probabilities to zero except class unknown
-                pred_classes_and_probs = {class_names[i]: 0.0 for i in range(num_classes)}
                 pred_classes_and_probs["unknown"] = 1.0
                 # Get the top predicted class
                 top_class = "unknown"
         # Get the description of the top predicted class
         top_class_description = food_descriptions.get(top_class, "Description not available.")
@@ -133,22 +204,21 @@ description = f"""
 A cutting-edge Vision Transformer (ViT) model to classify 101 delicious food types. Discover the power of AI in culinary recognition.
 ### Supported Food Types
-{', '.join(class_names[:-1])}.
 """
 # Configure the upload image area
 upload_input = gr.Image(type="pil", label="Upload Image", sources=['upload'], show_label=True, mirror_webcam=False)
 # Configure the dropdown option
-#model_dropdown = gr.Dropdown(
-#    choices=["Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)",
-#             "Vision Transformer - 224x224 pixels (lower accuracy, faster predictions)"],
-#    value="Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)",
-#    label="Select Model:"
-#)
 # Configure the sample image area
-food_vision_examples = [["examples/" + example] for example in os.listdir("examples")]
 # Author
 article = "Created by Sergio Sanz."
@@ -159,15 +229,16 @@ article = "Created by Sergio Sanz."
 # Create the Gradio demo
 demo = gr.Interface(fn=predict,                                                # mapping function from input to outputs
-                    inputs=upload_input,                                       # inputs #[upload_input, model_dropdown]
                     outputs=[gr.Label(num_top_classes=3, label="Prediction"),
                              gr.Textbox(label="Prediction time:"),
                              gr.Textbox(label="Food Description:")],           # outputs
-                    examples=food_vision_examples,                             # Create examples list from "examples/" directory
-                    cache_examples=True,                                       # Cache the examples
                     title=title,                                               # Title of the app
                     description=description,                                   # Brief description of the app
                     article=article,                                           # Created by...
                     theme="ocean")                                             # Theme
 # Launch the demo!

 # Specify class names
 food_vision_class_names_path = "class_names.txt"
 with open(food_vision_class_names_path, "r") as f:
+    class_names_102 = f.read().splitlines()
+class_names_101 = class_names_102.copy()
+class_names_101.remove("unknown")
 # Specify number of classes
+num_classes_102 = len(class_names_102) # 101 + unknown
+num_classes_101 = len(class_names_101) # 101
 # Load the food description file
 food_descriptions_json = "food_descriptions.json"
     compile=True
     )
+# Load the ViT-Base/16 transformer with input image of 384x384 pixels and 101 + unknown classes
+vitbase_model_102 = create_vitbase_model(
+    model_weights_dir=".",
+    model_weights_name="vitbase16_102_2025-01-07.pth",
+    img_size=384,
+    num_classes=num_classes_102,
+    compile=True
+)
+vitbase_model_101 = create_vitbase_model(
     model_weights_dir=".",
     model_weights_name="vitbase16_2_2024-12-31.pth",
     img_size=384,
+    num_classes=num_classes_101,
     compile=True
 )
 # Specify manual transforms for model_2
 transforms = v2.Compose([
+    v2.Resize((384)), #v2.Resize((384, 384)),
     v2.CenterCrop((384, 384)),
     v2.ToImage(),
     v2.ToDtype(torch.float32, scale=True),
                 std=[0.229, 0.224, 0.225])
 ])
 # Put models into evaluation mode and turn on inference mode
 effnetb0_model.eval()
+vitbase_model_102.eval()
+vitbase_model_101.eval()
 # Set thresholds
 BINARY_CLASSIF_THR = 0.9989122152328491
 MULTICLASS_CLASSIF_THR = 0.5
 ENTROPY_THR = 2.6
+# Set model names
+lite_model = "⚡ Lite (faster, less accurate)"
+pro_model = "💎 Pro  (slower, more accurate)"
+# Set allow flagging
+allow_flagging = "never" # "manual"
+# Predict method
+def predict(image, model=pro_model) -> Tuple[Dict, str, str]:
     """Transforms and performs a prediction on image and returns prediction and time taken.
     """
     try:
         # Start the timer
         start_time = timer()
         # Transform the target image and add a batch dimension
         image = transforms(image).unsqueeze(0)
         # Make prediction...
         with torch.inference_mode():
             # If the picture is food
             if effnetb0_model(image)[:,1].cpu() >= BINARY_CLASSIF_THR:
+                # If Pro
+                if model == pro_model:
+                    # Pass the transformed image through the model and turn the prediction logits into prediction probabilities
+                    pred_probs_102 = torch.softmax(vitbase_model_102(image), dim=1)
+                    pred_probs_101 = torch.softmax(vitbase_model_101(image), dim=1)
+                    # Calculate entropy
+                    entropy = -torch.sum(pred_probs_101 * torch.log(pred_probs_101), dim=1).item()
+                    # Create a prediction label and prediction probability dictionary for each prediction class
+                    pred_classes_and_probs_102 = {class_names_102[i]: float(pred_probs_102[0][i]) for i in range(num_classes_102)}
+                    pred_classes_and_probs_101 = {class_names_101[i]: float(pred_probs_101[0][i]) for i in range(num_classes_101)}
+                    pred_classes_and_probs_101["unknown"] = 0.0
+                    # Get the top predicted class
+                    top_class_102 = max(pred_classes_and_probs_102, key=pred_classes_and_probs_102.get)
+                    sec_class_102 = sorted(pred_classes_and_probs_102.items(), key=lambda x: x[1], reverse=True)[1][0]
+                    top_class_101 = max(pred_classes_and_probs_101, key=pred_classes_and_probs_101.get)
+                    # If the image is likely to be an unknown category
+                    if pred_probs_101[0][class_names_101.index(top_class_101)] <= MULTICLASS_CLASSIF_THR and entropy > ENTROPY_THR:
+                        # Create prediction label and prediction probability for class unknown and rescale the rest of predictions
+                        pred_classes_and_probs_101["unknown"] = pred_probs_101.max() * 1.25
+                        prob_sum = sum(pred_classes_and_probs_101.values())
+                        pred_classes_and_probs = {key: value / prob_sum for key, value in pred_classes_and_probs_101.items()}
+                        # Get the top predicted class
+                        top_class = "unknown"
+                    elif ((top_class_101 == sec_class_102) and (top_class_102 == "unknown")) or (top_class_101 == top_class_102):
+                        # Get the probability vector
+                        pred_classes_and_probs = pred_classes_and_probs_101
+                        # Get the top predicted class
+                        top_class = top_class_101
+                    else:
+                        # Get the probability vector
+                        pred_classes_and_probs = pred_classes_and_probs_102
+                        # Get the top predicted class
+                        top_class = top_class_102
+                # Otherwise
+                else:
+                    # Pass the transformed image through the model and turn the prediction logits into prediction probabilities
+                    pred_probs = torch.softmax(vitbase_model_101(image), dim=1) # 101 classes
+                    # Calculate entropy
+                    entropy = -torch.sum(pred_probs * torch.log(pred_probs), dim=1).item()
+                    # Create a prediction label and prediction probability dictionary for each prediction class
+                    pred_classes_and_probs = {class_names_101[i]: float(pred_probs[0][i]) for i in range(num_classes_101)}
+                    pred_classes_and_probs["unknown"] = 0.0
                     # Get the top predicted class
+                    top_class = max(pred_classes_and_probs, key=pred_classes_and_probs.get)
+                    # If the image is likely to be an unknown category
+                    if pred_probs[0][class_names_101.index(top_class)] <= MULTICLASS_CLASSIF_THR and entropy > ENTROPY_THR:
+                        # Create prediction label and prediction probability for class unknown and rescale the rest of predictions
+                        pred_classes_and_probs["unknown"] = pred_probs.max() * 1.25
+                        prob_sum = sum(pred_classes_and_probs.values())
+                        pred_classes_and_probs = {key: value / prob_sum for key, value in pred_classes_and_probs.items()}
+                        # Get the top predicted class
+                        top_class = "unknown"
             # Otherwise
             else:
                 # Set all probabilities to zero except class unknown
+                pred_classes_and_probs = {class_names_101[i]: 0.0 for i in range(num_classes_101)}
                 pred_classes_and_probs["unknown"] = 1.0
                 # Get the top predicted class
                 top_class = "unknown"
         # Get the description of the top predicted class
         top_class_description = food_descriptions.get(top_class, "Description not available.")
 A cutting-edge Vision Transformer (ViT) model to classify 101 delicious food types. Discover the power of AI in culinary recognition.
 ### Supported Food Types
+{', '.join(class_names_102[:-1])}.
 """
 # Configure the upload image area
 upload_input = gr.Image(type="pil", label="Upload Image", sources=['upload'], show_label=True, mirror_webcam=False)
 # Configure the dropdown option
+model_dropdown = gr.Dropdown(
+    choices=[lite_model, pro_model],
+    value=pro_model,
+    label="Select ViT Model:"
+)
 # Configure the sample image area
+# food_vision_examples = [["examples/" + example] for example in os.listdir("examples")]
 # Author
 article = "Created by Sergio Sanz."
 # Create the Gradio demo
 demo = gr.Interface(fn=predict,                                                # mapping function from input to outputs
+                    inputs=[upload_input, model_dropdown],                     # inputs
                     outputs=[gr.Label(num_top_classes=3, label="Prediction"),
                              gr.Textbox(label="Prediction time:"),
                              gr.Textbox(label="Food Description:")],           # outputs
+                    #examples=food_vision_examples,                            # Create examples list from "examples/" directory
+                    #cache_examples=True,                                       # Cache the examples
                     title=title,                                               # Title of the app
                     description=description,                                   # Brief description of the app
                     article=article,                                           # Created by...
+                    allow_flagging=allow_flagging,                             # Only For debugging
                     theme="ocean")                                             # Theme
 # Launch the demo!

class_names.txt CHANGED Viewed

@@ -98,5 +98,5 @@ tacos
 takoyaki
 tiramisu
 tuna tartare
-waffles
-unknown

 takoyaki
 tiramisu
 tuna tartare
+unknown
+waffles

food_descriptions.json CHANGED Viewed

@@ -99,6 +99,6 @@
     "takoyaki": "Japanese snack made from batter, octopus, and tempura bits, served with takoyaki sauce.",
     "tiramisu": "Italian dessert with coffee-soaked ladyfingers, mascarpone cheese, and cocoa powder.",
     "tuna tartare": "Finely diced raw tuna, often mixed with soy sauce and served as an appetizer.",
-    "waffles": "Batter-based dish cooked in a grid pattern, served with syrup, fruit, or whipped cream.",
-    "unknown": "No sufficient confidence to classify the image."
 }

     "takoyaki": "Japanese snack made from batter, octopus, and tempura bits, served with takoyaki sauce.",
     "tiramisu": "Italian dessert with coffee-soaked ladyfingers, mascarpone cheese, and cocoa powder.",
     "tuna tartare": "Finely diced raw tuna, often mixed with soy sauce and served as an appetizer.",
+    "unknown": "No sufficient confidence to classify the image.",
+    "waffles": "Batter-based dish cooked in a grid pattern, served with syrup, fruit, or whipped cream."
 }