Spaces: Running on Zero

More UI changes and samples loading fix

app.py CHANGED
@@ -195,44 +195,54 @@ def prompt_lm(
     return results


-def add_user_query(
-    chatbot_history: list[dict], audio_input: str, chat_input: str
-) -> list[dict]:
+def make_spectrogram_figure(audio_input: str) -> list[dict]:
+    # Load audio with torchaudio and compute spectrogram
+    if not audio_input:
+        # Return an empty figure if no audio input is provided
+        return get_spectrogram(torch.zeros(1, SAMPLE_RATE))
+
+    # Check if file exists and is accessible
+    try:
+        if not Path(audio_input).exists():
+            print(f"Audio file does not exist: {audio_input}")
+            return get_spectrogram(torch.zeros(1, SAMPLE_RATE))
+
+        if not Path(audio_input).is_file():
+            print(f"Path is not a valid file: {audio_input}")
+            return get_spectrogram(torch.zeros(1, SAMPLE_RATE))
+
+        audio_tensor, sample_rate = torchaudio.load(audio_input)
+        spectrogram_fig = get_spectrogram(audio_tensor)
+        return spectrogram_fig
+    except Exception as e:
+        print(f"Error loading audio file {audio_input}: {e}")
+        # Return an empty spectrogram on error
+        return get_spectrogram(torch.zeros(1, SAMPLE_RATE))
+
+
+def add_user_query(chatbot_history: list[dict], chat_input: str) -> list[dict]:
     """Add user message to chat and get model response"""
     # Validate input
     if not chat_input.strip():
         return chatbot_history

-    if not audio_input:
-        chatbot_history.append({"role": "user", "content": chat_input.strip()})
-        chatbot_history.append({"role": "assistant", "content": "Thinking..."})
-        return chatbot_history
-
-    # Load audio with torchaudio and compute spectrogram
-    audio_tensor, sample_rate = torchaudio.load(audio_input)
-    spectrogram_fig = get_spectrogram(audio_tensor)
-    # Add gr.Plot to chatbot history
-    chatbot_history.append(
-        {"role": "user", "content": gr.Plot(spectrogram_fig, label="Spectrogram")}
-    )
-    # Add user message to chat history first
     chatbot_history.append({"role": "user", "content": chat_input.strip()})
-    chatbot_history.append({"role": "assistant", "content": "Thinking..."})
     return chatbot_history


-def get_response(
-    chatbot_history: list[dict], audio_input: str, chat_input: str
-) -> list[dict]:
+def get_response(chatbot_history: list[dict], audio_input: str) -> list[dict]:
    """Generate response from the model based on user input and audio file"""
     try:
+        # Get the last user message from chat history
+        last_user_message = ""
+        for message in reversed(chatbot_history):
+            if message["role"] == "user":
+                last_user_message = message["content"]
+                break
+
         response = prompt_lm(
             audios=[audio_input],
-            queries=[chat_input.strip()],
+            queries=[last_user_message.strip()],
             window_length_seconds=100_000,
             hop_length_seconds=100_000,
         )
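
Note: `get_spectrogram`, `SAMPLE_RATE`, and the `torch`/`torchaudio` imports used above are defined elsewhere in app.py and are not part of this diff. A minimal sketch of what such a helper could look like, assuming torchaudio and matplotlib (the names and parameters below are assumptions, not the Space's actual code):

```python
import matplotlib.pyplot as plt
import torch
import torchaudio

SAMPLE_RATE = 32_000  # assumed value; the real constant lives elsewhere in app.py


def get_spectrogram(audio_tensor: torch.Tensor):
    """Return a matplotlib figure showing a log-power spectrogram."""
    mono = audio_tensor.mean(dim=0)  # mix down to mono
    spec = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=256)(mono)
    fig, ax = plt.subplots()
    # Log scale with a small epsilon so silent frames do not produce -inf
    ax.imshow((spec + 1e-10).log10().numpy(), origin="lower", aspect="auto")
    ax.set_xlabel("Frame")
    ax.set_ylabel("Frequency bin")
    return fig
```

Under this reading, `get_spectrogram(torch.zeros(1, SAMPLE_RATE))` yields the blank one-second placeholder figure the diff uses as its empty/error fallback.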
@@ -250,11 +260,6 @@ def get_response(
         return chatbot_history


-def temp_func(chatbot_history: list[dict]):
-    # Search for the last user message that
-    pass
-
-
 def main(
     assets_dir: Path,
     cfg_path: str | Path,
@@ -283,31 +288,18 @@ def main(

     examples = {
         "Caption the audio (Lazuli Bunting)": [
-            [
-                user_message({"path": str(laz_audio)}),
-                user_message("Caption the audio."),
-            ]
+            str(laz_audio),
+            "What is the common name for the focal species in the audio?",
         ],
         "Caption the audio (Green Tree Frog)": [
-            [
-                user_message({"path": str(frog_audio)}),
-                user_message(
-                    "Caption the audio, using the common name for any animal species."
-                ),
-            ]
+            str(frog_audio),
+            "Caption the audio, using the common name for any animal species.",
         ],
         "Caption the audio (American Robin)": [
-            [
-                user_message({"path": str(robin_audio)}),
-                user_message("Caption the audio."),
-            ]
-        ],
-        "Caption the audio (Warbling Vireo)": [
-            [
-                user_message({"path": str(vireo_audio)}),
-                user_message("Caption the audio."),
-            ]
+            str(robin_audio),
+            "Caption the audio, using the scientific name for any animal species.",
         ],
+        "Caption the audio (Warbling Vireo)": [str(vireo_audio), "Caption the audio."],
     }

     with gr.Blocks(
@@ -325,12 +317,12 @@ def main(
         with gr.Tab("Analyze Audio"):
             uploaded_audio = gr.State()
             # Status indicator
-            status_text = gr.Textbox(
-                value=model_manager.get_status(),
-                label="Model Status",
-                interactive=False,
-                visible=True,
-            )
+            # status_text = gr.Textbox(
+            #     value=model_manager.get_status(),
+            #     label="Model Status",
+            #     interactive=False,
+            #     visible=True,
+            # )

             with gr.Column(visible=True) as onboarding_message:
                 gr.HTML(
@@ -383,22 +375,12 @@ def main(
                     sources=["upload"],
                 )
                 with gr.Group(visible=False) as chat:
-                    chatbot = gr.Chatbot(
-                        elem_id="chatbot",
-                        type="messages",
-                        label="Chat",
-                        render_markdown=False,
-                        group_consecutive_messages=False,
-                        feedback_options=[
-                            "like",
-                            "dislike",
-                            "wrong species",
-                            "incorrect response",
-                            "other",
-                        ],
-                        resizeable=True,
+                    plotter = gr.Plot(
+                        get_spectrogram(torch.zeros(1, SAMPLE_RATE)),
+                        label="Spectrogram",
+                        visible=False,
+                        elem_id="spectrogram-plot",
                     )
-                    gr.Markdown("### Your Query")
                     task_dropdown = gr.Dropdown(
                         [
                             "What are the common names for the species in the audio, if any?",
@@ -418,21 +400,37 @@ def main(
                         info="Select a task or enter a custom query below",
                         value=None,
                     )
+                    chatbot = gr.Chatbot(
+                        elem_id="chatbot",
+                        type="messages",
+                        label="Chat",
+                        render_markdown=False,
+                        group_consecutive_messages=False,
+                        feedback_options=[
+                            "like",
+                            "dislike",
+                            "wrong species",
+                            "incorrect response",
+                            "other",
+                        ],
+                        resizeable=True,
+                    )
+                    gr.Markdown("### Your Query")

-                    def validate_and_submit(chatbot_history, audio_input, chat_input):
+                    def validate_and_submit(chatbot_history, chat_input):
                         if not chat_input or not chat_input.strip():
                             gr.Warning("Please enter a query before sending.")
                             return chatbot_history, chat_input

+                        updated_history = add_user_query(chatbot_history, chat_input)
+                        return updated_history, ""
+
+                    def update_current_audio(audio_input):
                         # if this audio_input is the same as the CURRENT_AUDIO, set it None
                         # else update CURRENT_AUDIO
                         global CURRENT_AUDIO
-                        if audio_input == CURRENT_AUDIO:
-                            audio_in = None
-                        else:
+                        if audio_input != CURRENT_AUDIO:
                             CURRENT_AUDIO = audio_input
-                            audio_in = audio_input
-                        return add_user_query(chatbot_history, audio_in, chat_input)

                     chat_input = gr.Textbox(
                         placeholder="Enter a query and press Shift+Enter to send",
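
In the rewritten `validate_and_submit`, returning `(updated_history, "")` lets a single handler both append to the chat and clear the textbox, while the invalid-input branch returns the original text so the user can correct it. A self-contained sketch of this clear-on-submit pattern, assuming gradio 4.x (component names here are illustrative, not the Space's):

```python
import gradio as gr


def submit(history, text):
    if not text or not text.strip():
        gr.Warning("Please enter a query before sending.")
        return history, text  # keep the text so the user can fix it
    history = (history or []) + [{"role": "user", "content": text.strip()}]
    return history, ""  # returning "" clears the Textbox


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    box = gr.Textbox(placeholder="Type a query and press Enter")
    box.submit(submit, inputs=[chatbot, box], outputs=[chatbot, box])

if __name__ == "__main__":
    demo.launch()
```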
@@ -458,7 +456,8 @@ def main(
                     )

                     clear_button = gr.ClearButton(
-                        components=[chatbot, chat_input, audio_input],
+                        components=[chatbot, chat_input, audio_input, plotter],
+                        visible=False,
                     )

                     def start_chat_interface(audio_path):
@@ -466,26 +465,42 @@ def main(
                             gr.update(visible=False), # hide onboarding message
                             gr.update(visible=True), # show upload section
                             gr.update(visible=True), # show chat box
+                            gr.update(visible=True), # show plotter
                         )

+                    # When audio added, set spectrogram
                     audio_input.change(
                         fn=start_chat_interface,
                         inputs=[audio_input],
-                        outputs=[onboarding_message, upload_section, chat],
+                        outputs=[onboarding_message, upload_section, chat, plotter],
+                    ).then(
+                        fn=update_current_audio,
+                        inputs=[audio_input],
+                        outputs=[],
+                    ).then(
+                        fn=make_spectrogram_figure,
+                        inputs=[audio_input],
+                        outputs=[plotter],
                     )

+                    # When submit clicked first:
+                    # 1. Validate and add user query to chat history
+                    # 2. Get response from model
+                    # 3. Clear the chat input box
+                    # 4. Show clear button
                     chat_input.submit(
                         validate_and_submit,
-                        inputs=[chatbot, audio_input, chat_input],
-                        outputs=[chatbot],
+                        inputs=[chatbot, chat_input],
+                        outputs=[chatbot, chat_input],
                     ).then(
                         get_response,
-                        inputs=[chatbot, audio_input, chat_input],
+                        inputs=[chatbot, audio_input],
                         outputs=[chatbot],
-                    ).then(
-                        lambda: gr.ClearButton(visible=True), None, [clear_button],
-                    )
+                    ).then(
+                        lambda: gr.update(visible=True), # Show clear button
+                        None,
+                        [clear_button],
+                    )

                     clear_button.click(
                         lambda: gr.ClearButton(visible=False), None, [clear_button]
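
The reworked `audio_input.change` chain relies on Gradio's `.then()`, which runs each handler only after the previous one finishes: reveal the panels first, record the current audio, then render the (potentially slow) spectrogram into `plotter`. A stripped-down sketch of that chaining, assuming gradio 4.x (the handlers are placeholders):

```python
import gradio as gr


def reveal(path):
    # Step 1: show the panel as soon as a file arrives
    return gr.update(visible=bool(path))


def describe(path):
    # Step 2: runs only after reveal() has completed
    return f"Loaded: {path}" if path else ""


with gr.Blocks() as demo:
    audio = gr.Audio(type="filepath", sources=["upload"])
    info = gr.Textbox(visible=False, label="Info")
    audio.change(fn=reveal, inputs=[audio], outputs=[info]).then(
        fn=describe, inputs=[audio], outputs=[info]
    )

if __name__ == "__main__":
    demo.launch()
```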
@@ -493,10 +508,11 @@ def main(

         with gr.Tab("Sample Library"):
             gr.Markdown("## Sample Library\n\nExplore example audio files below.")
+
             gr.Examples(
                 list(examples.values()),
-                [chatbot],
-                [chatbot],
+                [audio_input, chat_input],
+                [audio_input, chat_input],
                 example_labels=list(examples.keys()),
                 examples_per_page=20,
             )
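
The samples-loading fix hinges on `gr.Examples` receiving the input components as its second argument: each example row is now a flat `[audio_path, query]` pair that maps onto `[audio_input, chat_input]`, so clicking a sample populates both components and, via `audio_input.change`, kicks off the spectrogram chain above. A minimal sketch, assuming gradio 4.x (the audio path is a placeholder):

```python
import gradio as gr

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath")
    chat_input = gr.Textbox(label="Query")
    gr.Examples(
        [["sample.wav", "Caption the audio."]],  # one [audio, query] row
        [audio_input, chat_input],  # components populated on click
        example_labels=["Caption the audio (sample)"],
    )

if __name__ == "__main__":
    demo.launch()
```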