Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 23

Commit

ff9d355

verified ·

1 Parent(s): 1129e66

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +517 -49

services/streaming_voice_service.py CHANGED Viewed

@@ -1,8 +1,376 @@
 import io
 import numpy as np
 import soundfile as sf
 import time
 import traceback
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 from config.settings import settings
@@ -22,10 +390,17 @@ class StreamingVoiceService:
         self.vad_processor = SileroVAD()
         self.is_listening = False
         self.speech_callback = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe với VAD"""
@@ -33,9 +408,11 @@ class StreamingVoiceService:
             return False
         self.speech_callback = speech_callback
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
             self.is_listening = True
             print("🎙️ Đã bắt đầu lắng nghe với VAD")
         return success
@@ -43,28 +420,46 @@ class StreamingVoiceService:
         """Dừng lắng nghe"""
         self.vad_processor.stop_stream()
         self.is_listening = False
         self.speech_callback = None
         print("🛑 Đã dừng lắng nghe")
     def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
-        if not audio_data or not self.is_listening:
             return {
-                'transcription': "",
                 'response': "",
-                'tts_audio': None
             }
         try:
             sample_rate, audio_array = audio_data
             # Xử lý với VAD
             self.vad_processor.process_stream(audio_array, sample_rate)
             return {
                 'transcription': "Đang lắng nghe...",
                 'response': "",
-                'tts_audio': None
             }
         except Exception as e:
@@ -72,36 +467,81 @@ class StreamingVoiceService:
             return {
                 'transcription': "",
                 'response': "",
-                'tts_audio': None
             }
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
         """Callback khi VAD phát hiện speech"""
         print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
-        # Chuyển đổi speech thành text
-        transcription = self._transcribe_audio(speech_audio, sample_rate)
-        if not transcription or len(transcription.strip()) < 2:
-            print("⚠️ Transcription quá ngắn hoặc trống")
             return
-        print(f"📝 VAD Transcription: {transcription}")
-        self.current_transcription = transcription
-        # Tạo phản hồi AI
-        response = self._generate_ai_response(transcription)
-        # Tạo TTS
-        tts_audio_path = self._text_to_speech(response)
-        # Gửi kết quả đến callback
-        if self.speech_callback:
-            self.speech_callback({
-                'transcription': transcription,
-                'response': response,
-                'tts_audio': tts_audio_path
-            })
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming (phương thức cũ cho compatibility)"""
@@ -109,7 +549,17 @@ class StreamingVoiceService:
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
                 'response': "Vui lòng nói lại",
-                'tts_audio': None
             }
         try:
@@ -129,7 +579,8 @@ class StreamingVoiceService:
                 return {
                     'transcription': "❌ Âm thanh trống",
                     'response': "Vui lòng nói lại",
-                    'tts_audio': None
                 }
             # Tính toán âm lượng
@@ -141,7 +592,8 @@ class StreamingVoiceService:
                 return {
                     'transcription': "❌ Âm thanh quá yếu",
                     'response': "Xin vui lòng nói to hơn",
-                    'tts_audio': None
                 }
             # Sử dụng VAD để kiểm tra speech
@@ -149,7 +601,8 @@ class StreamingVoiceService:
                 return {
                     'transcription': "❌ Không phát hiện giọng nói",
                     'response': "Vui lòng nói rõ hơn",
-                    'tts_audio': None
                 }
             # Chuyển đổi thành văn bản
@@ -159,7 +612,8 @@ class StreamingVoiceService:
                 return {
                     'transcription': "❌ Không nghe rõ",
                     'response': "Xin vui lòng nói lại rõ hơn",
-                    'tts_audio': None
                 }
             # Kiểm tra nếu transcription quá ngắn
@@ -167,7 +621,8 @@ class StreamingVoiceService:
                 return {
                     'transcription': "❌ Câu nói quá ngắn",
                     'response': "Xin vui lòng nói câu dài hơn",
-                    'tts_audio': None
                 }
             print(f"📝 Đã chuyển đổi: {transcription}")
@@ -184,7 +639,8 @@ class StreamingVoiceService:
             return {
                 'transcription': transcription,
                 'response': response,
-                'tts_audio': tts_audio_path
             }
         except Exception as e:
@@ -193,11 +649,12 @@ class StreamingVoiceService:
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
                 'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
-                'tts_audio': None
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
-        """Chuyển audio -> text với xử lý sample rate"""
         try:
             # Đảm bảo kiểu dữ liệu là int16
             if audio_data.dtype != np.int16:
@@ -235,18 +692,27 @@ class StreamingVoiceService:
             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
-            # Gọi API Whisper
-            transcription = self.client.audio.transcriptions.create(
-                model=settings.WHISPER_MODEL,
-                file=("speech.wav", buffer.read(), "audio/wav"),
-                response_format="text",
-                language="vi",
-                temperature=0.0,
-            )
             # Xử lý response
             if hasattr(transcription, 'text'):
@@ -265,7 +731,7 @@ class StreamingVoiceService:
             return None
     def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-        """Resample audio sử dụng scipy"""
         try:
             from scipy import signal
@@ -273,11 +739,11 @@ class StreamingVoiceService:
             duration = len(audio_data) / orig_sr
             new_length = int(duration * target_sr)
-            # Resample sử dụng scipy.signal.resample
             resampled_audio = signal.resample(audio_data, new_length)
             # Chuyển lại về int16
-            resampled_audio = resampled_audio.astype(np.int16)
             return resampled_audio
@@ -298,7 +764,7 @@ class StreamingVoiceService:
             return audio_data
     def _generate_ai_response(self, user_input: str) -> str:
-        """Sinh phản hồi AI"""
         try:
             # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
@@ -334,10 +800,11 @@ Thông tin tham khảo:
             return response
         except Exception as e:
-            return f"Xin lỗi, tôi gặp lỗi khi tạo phản hồi: {str(e)}"
     def _text_to_speech(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
@@ -361,7 +828,8 @@ Thông tin tham khảo:
         """Lấy trạng thái hội thoại"""
         return {
             'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")
-        }

+# import io
+# import numpy as np
+# import soundfile as sf
+# import time
+# import traceback
+# from groq import Groq
+# from typing import Optional, Dict, Any, Callable
+# from config.settings import settings
+# from core.rag_system import EnhancedRAGSystem
+# from core.tts_service import EnhancedTTSService
+# from core.speechbrain_vad import SpeechBrainVAD
+# from core.silero_vad import SileroVAD
+# class StreamingVoiceService:
+#     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
+#         self.client = groq_client
+#         self.rag_system = rag_system
+#         self.tts_service = tts_service
+#         # Khởi tạo VAD
+#         self.vad_processor = SileroVAD()
+#         self.is_listening = False
+#         self.speech_callback = None
+#         # Conversation context
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#     def start_listening(self, speech_callback: Callable) -> bool:
+#         """Bắt đầu lắng nghe với VAD"""
+#         if self.is_listening:
+#             return False
+#         self.speech_callback = speech_callback
+#         success = self.vad_processor.start_stream(self._on_speech_detected)
+#         if success:
+#             self.is_listening = True
+#             print("🎙️ Đã bắt đầu lắng nghe với VAD")
+#         return success
+#     def stop_listening(self):
+#         """Dừng lắng nghe"""
+#         self.vad_processor.stop_stream()
+#         self.is_listening = False
+#         self.speech_callback = None
+#         print("🛑 Đã dừng lắng nghe")
+#     def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
+#         """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
+#         if not audio_data or not self.is_listening:
+#             return {
+#                 'transcription': "",
+#                 'response': "",
+#                 'tts_audio': None
+#             }
+#         try:
+#             sample_rate, audio_array = audio_data
+#             # Xử lý với VAD
+#             self.vad_processor.process_stream(audio_array, sample_rate)
+#             return {
+#                 'transcription': "Đang lắng nghe...",
+#                 'response': "",
+#                 'tts_audio': None
+#             }
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý audio chunk: {e}")
+#             return {
+#                 'transcription': "",
+#                 'response': "",
+#                 'tts_audio': None
+#             }
+#     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+#         """Callback khi VAD phát hiện speech"""
+#         print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
+#         # Chuyển đổi speech thành text
+#         transcription = self._transcribe_audio(speech_audio, sample_rate)
+#         if not transcription or len(transcription.strip()) < 2:
+#             print("⚠️ Transcription quá ngắn hoặc trống")
+#             return
+#         print(f"📝 VAD Transcription: {transcription}")
+#         self.current_transcription = transcription
+#         # Tạo phản hồi AI
+#         response = self._generate_ai_response(transcription)
+#         # Tạo TTS
+#         tts_audio_path = self._text_to_speech(response)
+#         # Gửi kết quả đến callback
+#         if self.speech_callback:
+#             self.speech_callback({
+#                 'transcription': transcription,
+#                 'response': response,
+#                 'tts_audio': tts_audio_path
+#             })
+#     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+#         """Xử lý audio streaming (phương thức cũ cho compatibility)"""
+#         if not audio_data:
+#             return {
+#                 'transcription': "❌ Không có dữ liệu âm thanh",
+#                 'response': "Vui lòng nói lại",
+#                 'tts_audio': None
+#             }
+#         try:
+#             # Lấy dữ liệu audio từ Gradio
+#             sample_rate, audio_array = audio_data
+#             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
+#             # Kiểm tra kiểu dữ liệu và chuyển đổi nếu cần
+#             if isinstance(audio_array, np.ndarray):
+#                 if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
+#                     # Chuyển từ float sang int16
+#                     audio_array = (audio_array * 32767).astype(np.int16)
+#             # Kiểm tra audio có dữ liệu không
+#             if len(audio_array) == 0:
+#                 return {
+#                     'transcription': "❌ Âm thanh trống",
+#                     'response': "Vui lòng nói lại",
+#                     'tts_audio': None
+#                 }
+#             # Tính toán âm lượng
+#             audio_abs = np.abs(audio_array.astype(np.float32))
+#             audio_rms = np.sqrt(np.mean(audio_abs**2)) / 32767.0
+#             print(f"📊 Âm lượng RMS: {audio_rms:.4f}")
+#             if audio_rms < 0.005:
+#                 return {
+#                     'transcription': "❌ Âm thanh quá yếu",
+#                     'response': "Xin vui lòng nói to hơn",
+#                     'tts_audio': None
+#                 }
+#             # Sử dụng VAD để kiểm tra speech
+#             if not self.vad_processor.is_speech(audio_array, sample_rate):
+#                 return {
+#                     'transcription': "❌ Không phát hiện giọng nói",
+#                     'response': "Vui lòng nói rõ hơn",
+#                     'tts_audio': None
+#                 }
+#             # Chuyển đổi thành văn bản
+#             transcription = self._transcribe_audio(audio_array, sample_rate)
+#             if not transcription or len(transcription.strip()) == 0:
+#                 return {
+#                     'transcription': "❌ Không nghe rõ",
+#                     'response': "Xin vui lòng nói lại rõ hơn",
+#                     'tts_audio': None
+#                 }
+#             # Kiểm tra nếu transcription quá ngắn
+#             if len(transcription.strip()) < 2:
+#                 return {
+#                     'transcription': "❌ Câu nói quá ngắn",
+#                     'response': "Xin vui lòng nói câu dài hơn",
+#                     'tts_audio': None
+#                 }
+#             print(f"📝 Đã chuyển đổi: {transcription}")
+#             # Cập nhật transcription hiện tại
+#             self.current_transcription = transcription
+#             # Tạo phản hồi AI
+#             response = self._generate_ai_response(transcription)
+#             # Tạo TTS
+#             tts_audio_path = self._text_to_speech(response)
+#             return {
+#                 'transcription': transcription,
+#                 'response': response,
+#                 'tts_audio': tts_audio_path
+#             }
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý streaming audio: {e}")
+#             print(f"Chi tiết lỗi: {traceback.format_exc()}")
+#             return {
+#                 'transcription': f"❌ Lỗi: {str(e)}",
+#                 'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
+#                 'tts_audio': None
+#             }
+#     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+#         """Chuyển audio -> text với xử lý sample rate"""
+#         try:
+#             # Đảm bảo kiểu dữ liệu là int16
+#             if audio_data.dtype != np.int16:
+#                 if audio_data.dtype in [np.float32, np.float64]:
+#                     audio_data = (audio_data * 32767).astype(np.int16)
+#                 else:
+#                     audio_data = audio_data.astype(np.int16)
+#             # Chuẩn hóa audio data
+#             if audio_data.ndim > 1:
+#                 audio_data = np.mean(audio_data, axis=1).astype(np.int16)  # Chuyển sang mono
+#             # Resample nếu sample rate không phải 16000Hz (Whisper yêu cầu)
+#             target_sample_rate = 16000
+#             if sample_rate != target_sample_rate:
+#                 audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
+#                 sample_rate = target_sample_rate
+#                 print(f"🔄 Đã resample từ {sample_rate}Hz xuống {target_sample_rate}Hz")
+#             # Giới hạn độ dài audio
+#             max_duration = 10  # giây
+#             max_samples = sample_rate * max_duration
+#             if len(audio_data) > max_samples:
+#                 audio_data = audio_data[:max_samples]
+#                 print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
+#             # Đảm bảo audio đủ dài
+#             min_duration = 0.5  # giây
+#             min_samples = int(sample_rate * min_duration)
+#             if len(audio_data) < min_samples:
+#                 # Pad audio nếu quá ngắn
+#                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
+#                 audio_data = np.concatenate([audio_data, padding])
+#                 print(f"⚠️ Đã pad audio lên {min_duration} giây")
+#             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
+#             buffer = io.BytesIO()
+#             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
+#             buffer.seek(0)
+#             # Gọi API Whisper
+#             transcription = self.client.audio.transcriptions.create(
+#                 model=settings.WHISPER_MODEL,
+#                 file=("speech.wav", buffer.read(), "audio/wav"),
+#                 response_format="text",
+#                 language="vi",
+#                 temperature=0.0,
+#             )
+#             # Xử lý response
+#             if hasattr(transcription, 'text'):
+#                 result = transcription.text.strip()
+#             elif isinstance(transcription, str):
+#                 result = transcription.strip()
+#             else:
+#                 result = str(transcription).strip()
+#             print(f"✅ Transcription thành công: '{result}'")
+#             return result
+#         except Exception as e:
+#             print(f"❌ Lỗi transcription: {e}")
+#             print(f"Audio details: dtype={audio_data.dtype}, shape={audio_data.shape}, sr={sample_rate}")
+#             return None
+#     def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+#         """Resample audio sử dụng scipy"""
+#         try:
+#             from scipy import signal
+#             # Tính số samples mới
+#             duration = len(audio_data) / orig_sr
+#             new_length = int(duration * target_sr)
+#             # Resample sử dụng scipy.signal.resample
+#             resampled_audio = signal.resample(audio_data, new_length)
+#             # Chuyển lại về int16
+#             resampled_audio = resampled_audio.astype(np.int16)
+#             return resampled_audio
+#         except ImportError:
+#             print("⚠️ Không có scipy, sử dụng simple resampling")
+#             # Simple resampling bằng interpolation
+#             orig_length = len(audio_data)
+#             new_length = int(orig_length * target_sr / orig_sr)
+#             # Linear interpolation
+#             x_old = np.linspace(0, 1, orig_length)
+#             x_new = np.linspace(0, 1, new_length)
+#             resampled_audio = np.interp(x_new, x_old, audio_data).astype(np.int16)
+#             return resampled_audio
+#         except Exception as e:
+#             print(f"❌ Lỗi resample: {e}")
+#             return audio_data
+#     def _generate_ai_response(self, user_input: str) -> str:
+#         """Sinh phản hồi AI"""
+#         try:
+#             # Thêm vào lịch sử
+#             self.conversation_history.append({"role": "user", "content": user_input})
+#             # Tìm kiếm RAG
+#             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
+#             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
+#             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
+# Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
+# Thông tin tham khảo:
+# {context_text}
+# """
+#             messages = [{"role": "system", "content": system_prompt}]
+#             # Giữ lại 4 tin nhắn gần nhất
+#             messages.extend(self.conversation_history[-4:])
+#             completion = self.client.chat.completions.create(
+#                 model="llama-3.1-8b-instant",
+#                 messages=messages,
+#                 max_tokens=150,
+#                 temperature=0.7
+#             )
+#             response = completion.choices[0].message.content
+#             self.conversation_history.append({"role": "assistant", "content": response})
+#             # Giới hạn lịch sử
+#             if len(self.conversation_history) > 8:
+#                 self.conversation_history = self.conversation_history[-8:]
+#             return response
+#         except Exception as e:
+#             return f"Xin lỗi, tôi gặp lỗi khi tạo phản hồi: {str(e)}"
+#     def _text_to_speech(self, text: str) -> Optional[str]:
+#         """Chuyển văn bản thành giọng nói"""
+#         try:
+#             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
+#                 return None
+#             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
+#             if tts_bytes:
+#                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
+#                 print(f"✅ Đã tạo TTS: {audio_path}")
+#                 return audio_path
+#         except Exception as e:
+#             print(f"❌ Lỗi TTS: {e}")
+#         return None
+#     def clear_conversation(self):
+#         """Xóa lịch sử hội thoại"""
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#         print("🗑️ Đã xóa lịch sử hội thoại")
+#     def get_conversation_state(self) -> dict:
+#         """Lấy trạng thái hội thoại"""
+#         return {
+#             'is_listening': self.is_listening,
+#             'history_length': len(self.conversation_history),
+#             'current_transcription': self.current_transcription,
+#             'last_update': time.strftime("%H:%M:%S")
+#         }
 import io
 import numpy as np
 import soundfile as sf
 import time
 import traceback
+import threading
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 from config.settings import settings
         self.vad_processor = SileroVAD()
         self.is_listening = False
         self.speech_callback = None
+        self.is_processing = False  # Tránh xử lý chồng chéo
+        self.last_speech_time = 0
+        self.silence_timeout = 2.0  # 2 giây im lặng thì dừng
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
+        # Audio buffer for VAD
+        self.audio_buffer = []
+        self.buffer_lock = threading.Lock()
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe với VAD"""
             return False
         self.speech_callback = speech_callback
+        self.last_speech_time = time.time()
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
             self.is_listening = True
+            self.is_processing = False
             print("🎙️ Đã bắt đầu lắng nghe với VAD")
         return success
         """Dừng lắng nghe"""
         self.vad_processor.stop_stream()
         self.is_listening = False
+        self.is_processing = False
         self.speech_callback = None
+        with self.buffer_lock:
+            self.audio_buffer = []
         print("🛑 Đã dừng lắng nghe")
     def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
+        if not audio_data or not self.is_listening or self.is_processing:
             return {
+                'transcription': "Đang lắng nghe...",
                 'response': "",
+                'tts_audio': None,
+                'status': 'listening'
             }
         try:
             sample_rate, audio_array = audio_data
+            # Thêm vào buffer và xử lý với VAD
+            with self.buffer_lock:
+                self.audio_buffer.extend(audio_array)
+                # Giới hạn buffer để tránh tràn bộ nhớ
+                max_buffer_samples = sample_rate * 10  # 10 giây
+                if len(self.audio_buffer) > max_buffer_samples:
+                    self.audio_buffer = self.audio_buffer[-max_buffer_samples:]
             # Xử lý với VAD
             self.vad_processor.process_stream(audio_array, sample_rate)
+            # Kiểm tra timeout im lặng
+            current_time = time.time()
+            if current_time - self.last_speech_time > self.silence_timeout and len(self.audio_buffer) > 0:
+                self._process_final_audio()
             return {
                 'transcription': "Đang lắng nghe...",
                 'response': "",
+                'tts_audio': None,
+                'status': 'listening'
             }
         except Exception as e:
             return {
                 'transcription': "",
                 'response': "",
+                'tts_audio': None,
+                'status': 'error'
             }
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
         """Callback khi VAD phát hiện speech"""
         print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
+        self.last_speech_time = time.time()
+        # Chỉ xử lý nếu không đang xử lý cái khác
+        if self.is_processing:
+            print("⚠️ Đang xử lý request trước đó, bỏ qua...")
             return
+        self.is_processing = True
+        try:
+            # Chuyển đổi speech thành text
+            transcription = self._transcribe_audio(speech_audio, sample_rate)
+            if not transcription or len(transcription.strip()) < 2:
+                print("⚠️ Transcription quá ngắn hoặc trống")
+                self.is_processing = False
+                return
+            print(f"📝 VAD Transcription: {transcription}")
+            self.current_transcription = transcription
+            # Tạo phản hồi AI
+            response = self._generate_ai_response(transcription)
+            # Tạo TTS
+            tts_audio_path = self._text_to_speech(response)
+            # Gửi kết quả đến callback
+            if self.speech_callback:
+                self.speech_callback({
+                    'transcription': transcription,
+                    'response': response,
+                    'tts_audio': tts_audio_path,
+                    'status': 'completed'
+                })
+        except Exception as e:
+            print(f"❌ Lỗi trong _on_speech_detected: {e}")
+        finally:
+            # Cho phép xử lý tiếp sau khi TTS kết thúc
+            threading.Timer(1.0, self._reset_processing).start()
+    def _reset_processing(self):
+        """Reset trạng thái xử lý sau khi hoàn thành"""
+        self.is_processing = False
+        with self.buffer_lock:
+            self.audio_buffer = []
+    def _process_final_audio(self):
+        """Xử lý audio cuối cùng khi hết thời gian im lặng"""
+        if self.is_processing or not self.audio_buffer:
+            return
+        try:
+            with self.buffer_lock:
+                if not self.audio_buffer:
+                    return
+                final_audio = np.array(self.audio_buffer)
+                self.audio_buffer = []
+            # Chỉ xử lý nếu audio đủ dài
+            if len(final_audio) > 16000 * 0.5:  # Ít nhất 0.5 giây
+                print("🔄 Xử lý audio cuối cùng do im lặng timeout")
+                self._on_speech_detected(final_audio, 16000)
+        except Exception as e:
+            print(f"❌ Lỗi xử lý final audio: {e}")
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming (phương thức cũ cho compatibility)"""
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
                 'response': "Vui lòng nói lại",
+                'tts_audio': None,
+                'status': 'error'
+            }
+        # Nếu đang xử lý VAD, trả về trạng thái processing
+        if self.is_processing:
+            return {
+                'transcription': "Đang xử lý...",
+                'response': "",
+                'tts_audio': None,
+                'status': 'processing'
             }
         try:
                 return {
                     'transcription': "❌ Âm thanh trống",
                     'response': "Vui lòng nói lại",
+                    'tts_audio': None,
+                    'status': 'error'
                 }
             # Tính toán âm lượng
                 return {
                     'transcription': "❌ Âm thanh quá yếu",
                     'response': "Xin vui lòng nói to hơn",
+                    'tts_audio': None,
+                    'status': 'error'
                 }
             # Sử dụng VAD để kiểm tra speech
                 return {
                     'transcription': "❌ Không phát hiện giọng nói",
                     'response': "Vui lòng nói rõ hơn",
+                    'tts_audio': None,
+                    'status': 'error'
                 }
             # Chuyển đổi thành văn bản
                 return {
                     'transcription': "❌ Không nghe rõ",
                     'response': "Xin vui lòng nói lại rõ hơn",
+                    'tts_audio': None,
+                    'status': 'error'
                 }
             # Kiểm tra nếu transcription quá ngắn
                 return {
                     'transcription': "❌ Câu nói quá ngắn",
                     'response': "Xin vui lòng nói câu dài hơn",
+                    'tts_audio': None,
+                    'status': 'error'
                 }
             print(f"📝 Đã chuyển đổi: {transcription}")
             return {
                 'transcription': transcription,
                 'response': response,
+                'tts_audio': tts_audio_path,
+                'status': 'completed'
             }
         except Exception as e:
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
                 'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
+                'tts_audio': None,
+                'status': 'error'
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+        """Chuyển audio -> text với xử lý sample rate cải tiến"""
         try:
             # Đảm bảo kiểu dữ liệu là int16
             if audio_data.dtype != np.int16:
             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
+            # Tạo temporary file trong memory
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
+            # Gọi API Whisper với timeout
+            import requests
+            try:
+                transcription = self.client.audio.transcriptions.create(
+                    model=settings.WHISPER_MODEL,
+                    file=("speech.wav", buffer.read(), "audio/wav"),
+                    response_format="text",
+                    language="vi",
+                    temperature=0.0,
+                )
+            except requests.exceptions.Timeout:
+                print("❌ Whisper API timeout")
+                return None
+            except Exception as e:
+                print(f"❌ Lỗi Whisper API: {e}")
+                return None
             # Xử lý response
             if hasattr(transcription, 'text'):
             return None
     def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample audio sử dụng scipy - cải tiến độ chính xác"""
         try:
             from scipy import signal
             duration = len(audio_data) / orig_sr
             new_length = int(duration * target_sr)
+            # Resample sử dụng scipy.signal.resample (phương pháp Fourier, không truyền window)
             resampled_audio = signal.resample(audio_data, new_length)
             # Chuyển lại về int16
+            resampled_audio = np.clip(resampled_audio, -32768, 32767).astype(np.int16)
             return resampled_audio
             return audio_data
     def _generate_ai_response(self, user_input: str) -> str:
+        """Sinh phản hồi AI với xử lý lỗi"""
         try:
             # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
             return response
         except Exception as e:
+            print(f"❌ Lỗi tạo AI response: {e}")
+            return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
     def _text_to_speech(self, text: str) -> Optional[str]:
+        """Chuyển văn bản thành giọng nói với xử lý lỗi"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
         """Lấy trạng thái hội thoại"""
         return {
             'is_listening': self.is_listening,
+            'is_processing': self.is_processing,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")
+        }