Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed 16 days ago

Commit

995e5f1

verified ·

1 Parent(s): 27b3d88

Update ui/tabs.py

Browse files

Files changed (1) hide show

ui/tabs.py +71 -176

ui/tabs.py CHANGED Viewed

@@ -60,22 +60,22 @@ def create_gemini_realtime_tab():
     with gr.Blocks() as gemini_tab:
         gr.Markdown("""
-        # 🎯 Gemini Realtime API
-        **Audio Streaming Thời Gian Thực với Google Gemini**
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Connection controls
                 with gr.Group():
-                    gr.Markdown("### 🔗 Kết nối Audio")
                     api_key = gr.Textbox(
                         label="Gemini API Key",
                         type="password",
                         placeholder="Nhập API key của bạn...",
                         value=os.getenv("GEMINI_API_KEY", ""),
-                        info="Cần cho Audio Streaming"
                     )
                     voice_select = gr.Dropdown(
@@ -106,31 +106,17 @@ def create_gemini_realtime_tab():
                     )
             with gr.Column(scale=2):
-                # Chat interface
-                with gr.Group():
-                    gr.Markdown("### 💬 Hội thoại")
-                    chatbot = gr.Chatbot(
-                        label="Gemini Chat",
-                        type="messages",
-                        height=300,
-                        show_copy_button=True,
-                        value=[]
-                    )
                 # Audio Streaming Interface
                 with gr.Group():
                     gr.Markdown("### 🎤 Audio Streaming")
-                    with gr.Row():
-                        start_audio_btn = gr.Button("🎙️ Bắt đầu nói", variant="primary")
-                        stop_audio_btn = gr.Button("⏹️ Dừng nói", variant="secondary")
-                    transcription_display = gr.Textbox(
-                        label="Bạn nói",
-                        interactive=False,
-                        lines=2,
-                        placeholder="Văn bản nhận diện sẽ hiển thị ở đây..."
                     )
                     # Audio output for Gemini responses
@@ -140,24 +126,22 @@ def create_gemini_realtime_tab():
                         autoplay=True
                     )
-                    # Audio input for user
-                    audio_input = gr.Audio(
-                        label="🎤 Micro của bạn",
-                        sources=["microphone"],
-                        type="numpy",
-                        interactive=True
                     )
         # State management
         connection_state = gr.State(value=False)
-        audio_streaming_state = gr.State(value=False)
         gemini_service_state = gr.State(value=None)
-        async def connect_gemini(api_key, voice_name, current_chat):
             """Kết nối Gemini Audio Streaming"""
             try:
                 if not api_key:
-                    return False, "❌ Vui lòng nhập API Key", "Chưa kết nối", current_chat, None
                 service = GeminiRealtimeService(api_key)
@@ -165,18 +149,10 @@ def create_gemini_realtime_tab():
                 async def handle_gemini_callback(data):
                     if data['type'] == 'status':
                         gr.Info(data['message'])
-                        return data['message'], data['message'], current_chat
                     elif data['type'] == 'text':
-                        # Cập nhật transcription
-                        new_chat = current_chat + [{"role": "assistant", "content": data['content']}]
-                        return f"📝 {data['content']}", data['content'], new_chat
-                    elif data['type'] == 'audio':
-                        # Xử lý audio stream (sẽ được xử lý trong audio output)
-                        return "🔊 Đang nhận audio...", "audio_received", current_chat
                     elif data['type'] == 'error':
                         gr.Warning(data['message'])
-                        return f"❌ {data['message']}", data['message'], current_chat
-                    return "Unknown", "unknown", current_chat
                 success = await service.start_session(
                     voice_name=voice_name,
@@ -184,166 +160,82 @@ def create_gemini_realtime_tab():
                 )
                 if success:
-                    welcome_msg = f"Xin chào! Tôi là Gemini với giọng {voice_name}. Hãy bắt đầu nói chuyện!"
-                    new_chat = current_chat + [{"role": "assistant", "content": welcome_msg}]
-                    info_msg = f"Đã kết nối audio streaming - Giọng: {voice_name}"
-                    return True, "✅ Đã kết nối Audio Streaming", info_msg, new_chat, service
                 else:
-                    return False, "❌ Không thể kết nối audio", "Lỗi kết nối", current_chat, None
             except Exception as e:
                 error_msg = f"❌ Lỗi kết nối: {str(e)}"
-                return False, error_msg, f"Lỗi: {str(e)}", current_chat, None
-        async def disconnect_gemini(current_chat, service):
             """Ngắt kết nối"""
             if service:
                 await service.close()
-            new_chat = current_chat + [{"role": "assistant", "content": "Đã ngắt kết nối audio streaming."}]
-            return False, "🔌 Đã ngắt kết nối", "Ngắt kết nối", new_chat, None
-        async def start_audio_stream(service, audio_state):
-            """Bắt đầu stream audio"""
-            if not service or not service.is_active:
-                return False, "❌ Chưa kết nối. Vui lòng kết nối trước."
-            return True, "🎙️ Đang nghe... Hãy bắt đầu nói!"
-        async def stop_audio_stream(service, audio_state):
-            """Dừng stream audio"""
-            return False, "⏹️ Đã dừng thu âm"
-        async def process_audio_input(audio_data, sample_rate, service, current_chat):
-            """Xử lý audio input từ user"""
             if not service or not service.is_active:
-                return current_chat, "❌ Chưa kết nối audio", current_chat
             if audio_data is None:
-                return current_chat, "⚠️ Không có audio input", current_chat
             try:
                 # Gửi audio đến Gemini
                 success = await service.send_audio_chunk(audio_data, sample_rate)
-                if success:
-                    # Nhận audio response từ Gemini
                     audio_response = await service.receive_audio()
-                    if audio_response:
-                        resp_sample_rate, resp_audio_data = audio_response
-                        # Lưu audio response để phát
-                        audio_path = f"gemini_response_{int(asyncio.get_event_loop().time())}.wav"
                         import scipy.io.wavfile as wavfile
-                        wavfile.write(audio_path, resp_sample_rate, resp_audio_data)
-                        info_msg = "🔊 Đã nhận phản hồi audio từ Gemini"
-                        return current_chat, info_msg, audio_path
-                    else:
-                        info_msg = "⏳ Đang chờ phản hồi audio..."
-                        return current_chat, info_msg, None
                 else:
-                    return current_chat, "❌ Lỗi gửi audio", current_chat
             except Exception as e:
                 error_msg = f"❌ Lỗi xử lý audio: {str(e)}"
-                return current_chat, error_msg, current_chat
-        async def send_text_message(message, current_chat, service):
-            """Gửi tin nhắn text (fallback)"""
-            if not service or not service.is_active:
-                error_msg = "❌ Chưa kết nối. Vui lòng kết nối trước."
-                new_chat = current_chat + [{"role": "user", "content": message}, {"role": "assistant", "content": error_msg}]
-                return new_chat, "Lỗi kết nối", new_chat
-            if not message.strip():
-                return current_chat, "⚠️ Vui lòng nhập tin nhắn", current_chat
-            try:
-                # Hiển thị tin nhắn user
-                new_chat = current_chat + [{"role": "user", "content": message}]
-                # Gửi text
-                response = await service.send_text(message)
-                # Cập nhật response
-                new_chat = new_chat + [{"role": "assistant", "content": response}]
-                return new_chat, f"✅ Đã nhận phản hồi ({len(response)} ký tự)", new_chat
-            except Exception as e:
-                error_msg = f"❌ Lỗi: {str(e)}"
-                new_chat = current_chat + [{"role": "user", "content": message}, {"role": "assistant", "content": error_msg}]
-                return new_chat, error_msg, new_chat
-        def clear_chat():
-            """Xóa chat"""
-            return [], "🧹 Đã xóa chat", []
-        # Thêm text input
-        with gr.Row():
-            text_input = gr.Textbox(
-                label="Hoặc nhập tin nhắn text",
-                placeholder="Nhập tin nhắn text và nhấn Enter...",
-                scale=4
-            )
-            send_text_btn = gr.Button("📤 Gửi text", scale=1)
         # Event handlers
         connect_btn.click(
             connect_gemini,
-            inputs=[api_key, voice_select, chatbot],
-            outputs=[connection_state, status_display, connection_info, chatbot, gemini_service_state]
         )
         disconnect_btn.click(
             disconnect_gemini,
-            inputs=[chatbot, gemini_service_state],
-            outputs=[connection_state, status_display, connection_info, chatbot, gemini_service_state]
-        )
-        start_audio_btn.click(
-            start_audio_stream,
-            inputs=[gemini_service_state, audio_streaming_state],
-            outputs=[audio_streaming_state, transcription_display]
-        )
-        stop_audio_btn.click(
-            stop_audio_stream,
-            inputs=[gemini_service_state, audio_streaming_state],
-            outputs=[audio_streaming_state, transcription_display]
         )
-        # Audio input processing
         audio_input.stop_recording(
             process_audio_input,
-            inputs=[audio_input, audio_input, gemini_service_state, chatbot],
-            outputs=[chatbot, connection_info, audio_output]
-        )
-        # Text message handling
-        send_text_btn.click(
-            send_text_message,
-            inputs=[text_input, chatbot, gemini_service_state],
-            outputs=[chatbot, connection_info, chatbot]
-        ).then(
-            lambda: "",
-            outputs=[text_input]
-        )
-        text_input.submit(
-            send_text_message,
-            inputs=[text_input, chatbot, gemini_service_state],
-            outputs=[chatbot, connection_info, chatbot]
-        ).then(
-            lambda: "",
-            outputs=[text_input]
-        )
-        clear_btn = gr.Button("🧹 Xóa chat", variant="secondary")
-        clear_btn.click(
-            clear_chat,
-            outputs=[chatbot, connection_info, chatbot]
         )
         # Hướng dẫn sử dụng
@@ -357,17 +249,14 @@ def create_gemini_realtime_tab():
                - Nhấn **"Kết nối Audio"**
             2. **Trò chuyện bằng giọng nói**:
-               - Nhấn **"Bắt đầu nói"**
-               - Nói vào micro
-               - Gemini sẽ trả lời bằng audio ngay lập tức
-            3. **Hoặc chat bằng text**:
-               - Nhập tin nhắn trong ô text
-               - Nhấn Enter hoặc **"Gửi text"**
-            ### 🔊 Tính năng Audio Streaming:
             - 🎙️ Real-time voice recognition
-            - 🔊 Real-time audio response
             - ⚡ Ultra low latency
             - 🎯 Multiple voice options
@@ -375,6 +264,12 @@ def create_gemini_realtime_tab():
             - Sử dụng headset để chất lượng tốt hơn
             - Nói rõ ràng, không nói quá nhanh
             - Môi trường yên tĩnh cho kết quả tốt nhất
             """)
     return gemini_tab

     with gr.Blocks() as gemini_tab:
         gr.Markdown("""
+        # 🎯 Gemini Audio Streaming
+        **Trò chuyện thời gian thực bằng giọng nói với Google Gemini**
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Connection controls
                 with gr.Group():
+                    gr.Markdown("### 🔗 Kết nối")
                     api_key = gr.Textbox(
                         label="Gemini API Key",
                         type="password",
                         placeholder="Nhập API key của bạn...",
                         value=os.getenv("GEMINI_API_KEY", ""),
+                        info="Lấy từ https://aistudio.google.com/"
                     )
                     voice_select = gr.Dropdown(
                     )
             with gr.Column(scale=2):
                 # Audio Streaming Interface
                 with gr.Group():
                     gr.Markdown("### 🎤 Audio Streaming")
+                    # Audio input for user
+                    audio_input = gr.Audio(
+                        label="🎤 Nhấn để nói chuyện với Gemini",
+                        sources=["microphone"],
+                        type="numpy",
+                        interactive=True,
+                        show_download_button=False
                     )
                     # Audio output for Gemini responses
                         autoplay=True
                     )
+                    transcription_display = gr.Textbox(
+                        label="💬 Nội dung hội thoại",
+                        interactive=False,
+                        lines=3,
+                        placeholder="Nội dung cuộc trò chuyện sẽ hiển thị ở đây..."
                     )
         # State management
         connection_state = gr.State(value=False)
         gemini_service_state = gr.State(value=None)
+        async def connect_gemini(api_key, voice_name):
             """Kết nối Gemini Audio Streaming"""
             try:
                 if not api_key:
+                    return False, "❌ Vui lòng nhập API Key", "Chưa kết nối", None
                 service = GeminiRealtimeService(api_key)
                 async def handle_gemini_callback(data):
                     if data['type'] == 'status':
                         gr.Info(data['message'])
                     elif data['type'] == 'text':
+                        gr.Info(f"Gemini: {data['content']}")
                     elif data['type'] == 'error':
                         gr.Warning(data['message'])
                 success = await service.start_session(
                     voice_name=voice_name,
                 )
                 if success:
+                    info_msg = f"✅ Đã kết nối Audio Streaming\nGiọng: {voice_name}\nHãy sử dụng micro để trò chuyện"
+                    return True, "✅ Đã kết nối Audio", info_msg, service
                 else:
+                    return False, "❌ Không thể kết nối audio", "Lỗi kết nối", None
             except Exception as e:
                 error_msg = f"❌ Lỗi kết nối: {str(e)}"
+                return False, error_msg, f"Lỗi: {str(e)}", None
+        async def disconnect_gemini(service):
             """Ngắt kết nối"""
             if service:
                 await service.close()
+            return False, "🔌 Đã ngắt kết nối", "Đã ngắt kết nối audio streaming", None
+        async def process_audio_input(audio_data, sample_rate, service):
+            """Xử lý audio input từ user và trả lời bằng audio"""
             if not service or not service.is_active:
+                return None, "❌ Chưa kết nối. Vui lòng kết nối audio trước.", "Chưa kết nối"
             if audio_data is None:
+                return None, "⚠️ Không có audio input", "Không có audio"
             try:
                 # Gửi audio đến Gemini
                 success = await service.send_audio_chunk(audio_data, sample_rate)
+                if not success:
+                    return None, "❌ Lỗi gửi audio đến Gemini", "Lỗi gửi audio"
+                # Chờ và nhận audio response từ Gemini
+                audio_response = None
+                max_attempts = 50  # Chờ tối đa 5 giây
+                for attempt in range(max_attempts):
                     audio_response = await service.receive_audio()
+                    if audio_response is not None:
+                        break
+                    await asyncio.sleep(0.1)  # Chờ 100ms giữa các lần thử
+                if audio_response:
+                    resp_sample_rate, resp_audio_data = audio_response
+                    # Lưu audio response vào file tạm
+                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                         import scipy.io.wavfile as wavfile
+                        wavfile.write(f.name, resp_sample_rate, resp_audio_data)
+                        audio_path = f.name
+                    info_msg = f"✅ Đã nhận phản hồi audio từ Gemini ({(len(resp_audio_data) / resp_sample_rate):.1f}s)"
+                    return audio_path, info_msg, "Thành công"
                 else:
+                    return None, "⏳ Không nhận được phản hồi audio từ Gemini", "Timeout"
             except Exception as e:
                 error_msg = f"❌ Lỗi xử lý audio: {str(e)}"
+                return None, error_msg, f"Lỗi: {str(e)}"
         # Event handlers
         connect_btn.click(
             connect_gemini,
+            inputs=[api_key, voice_select],
+            outputs=[connection_state, status_display, connection_info, gemini_service_state]
         )
         disconnect_btn.click(
             disconnect_gemini,
+            inputs=[gemini_service_state],
+            outputs=[connection_state, status_display, connection_info, gemini_service_state]
         )
+        # Xử lý audio input
         audio_input.stop_recording(
             process_audio_input,
+            inputs=[audio_input, audio_input, gemini_service_state],
+            outputs=[audio_output, connection_info, transcription_display]
         )
         # Hướng dẫn sử dụng
                - Nhấn **"Kết nối Audio"**
             2. **Trò chuyện bằng giọng nói**:
+               - Nhấn nút **Micro** để bắt đầu ghi âm
+               - Nói câu hỏi của bạn
+               - Nhấn **Dừng** để kết thúc ghi âm
+               - Gemini sẽ trả lời bằng giọng nói ngay lập tức
+            ### 🔊 Tính năng:
             - 🎙️ Real-time voice recognition
+            - 🔊 Real-time audio response
             - ⚡ Ultra low latency
             - 🎯 Multiple voice options
             - Sử dụng headset để chất lượng tốt hơn
             - Nói rõ ràng, không nói quá nhanh
             - Môi trường yên tĩnh cho kết quả tốt nhất
+            - Mỗi lần ghi âm nên ngắn hơn 30 giây
+            ### 🔧 Lưu ý kỹ thuật:
+            - Cần API Key Gemini có quyền Realtime API
+            - Audio được stream real-time đến Gemini
+            - Phản hồi audio được stream về và phát tự động
             """)
     return gemini_tab