Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 23

Commit

06ce176

verified ·

1 Parent(s): 8289f9b

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +93 -2

services/streaming_voice_service.py CHANGED Viewed

@@ -4,10 +4,11 @@ import soundfile as sf
 import time
 import traceback
 from groq import Groq
-from typing import Optional, Dict, Any
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
 class StreamingVoiceService:
@@ -16,12 +17,93 @@ class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming từ Gradio microphone component"""
         if not audio_data:
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
@@ -61,6 +143,14 @@ class StreamingVoiceService:
                     'tts_audio': None
                 }
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
@@ -269,6 +359,7 @@ Thông tin tham khảo:
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")

 import time
 import traceback
 from groq import Groq
+from typing import Optional, Dict, Any, Callable
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
+from core.speechbrain_vad import SpeechBrainVAD
 class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
+        # Khởi tạo VAD
+        self.vad_processor = SpeechBrainVAD()
+        self.is_listening = False
+        self.speech_callback = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
+    def start_listening(self, speech_callback: Callable) -> bool:
+        """Start listening with VAD.
+
+        Args:
+            speech_callback: Callable stored on the instance; it is later
+                invoked by ``_on_speech_detected`` with a dict holding the
+                keys 'transcription', 'response' and 'tts_audio'.
+
+        Returns:
+            False if the service is already listening; otherwise whatever
+            ``self.vad_processor.start_stream`` returned.
+        """
+        # Refuse a second start while a session is already active.
+        if self.is_listening:
+            return False
+        # NOTE(review): the callback is stored before start_stream reports
+        # its result, so it stays set even when the start fails.
+        self.speech_callback = speech_callback
+        # The VAD processor is handed our internal handler, not the caller's
+        # callback directly.
+        success = self.vad_processor.start_stream(self._on_speech_detected)
+        if success:
+            self.is_listening = True
+            print("🎙️ Đã bắt đầu lắng nghe với VAD")
+        return success
+    def stop_listening(self):
+        """Stop listening.
+
+        Calls ``stop_stream`` on the VAD processor, clears the listening
+        flag and drops the stored speech callback. Runs unconditionally:
+        there is no check that listening was active beforehand.
+        """
+        self.vad_processor.stop_stream()
+        self.is_listening = False
+        self.speech_callback = None
+        print("🛑 Đã dừng lắng nghe")
+    def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
+        """Process one audio chunk with VAD (used for real-time streaming).
+
+        Args:
+            audio_data: Tuple unpacked as ``(sample_rate, audio_array)``.
+
+        Returns:
+            A dict with the keys 'transcription', 'response' and 'tts_audio'.
+            All values are empty/None when there is no data, when the
+            service is not listening, or when processing raised. Otherwise
+            'transcription' holds a fixed "listening" status message; this
+            method itself never returns a real transcription.
+        """
+        # Ignore chunks that are empty or arrive while not listening.
+        if not audio_data or not self.is_listening:
+            return {
+                'transcription': "",
+                'response': "",
+                'tts_audio': None
+            }
+        try:
+            sample_rate, audio_array = audio_data
+            # Feed the chunk to the VAD processor.
+            # NOTE(review): presumably the VAD calls back into
+            # _on_speech_detected once a speech segment is complete - confirm
+            # against SpeechBrainVAD.process_stream.
+            self.vad_processor.process_stream(audio_array, sample_rate)
+            return {
+                'transcription': "Đang lắng nghe...",
+                'response': "",
+                'tts_audio': None
+            }
+        except Exception as e:
+            # Best effort: log the failure and return an empty result rather
+            # than raising to the caller.
+            print(f"❌ Lỗi xử lý audio chunk: {e}")
+            return {
+                'transcription': "",
+                'response': "",
+                'tts_audio': None
+            }
+    def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+        """Callback for when the VAD detects speech.
+
+        Transcribes the segment, generates an AI response, synthesizes TTS
+        audio, and forwards all three to ``self.speech_callback`` if one is
+        set. Returns early when the transcription is empty or shorter than
+        two characters after stripping whitespace.
+
+        Args:
+            speech_audio: Audio samples of the detected speech segment.
+            sample_rate: Sample rate of ``speech_audio``; used to compute the
+                logged duration and passed on to transcription.
+
+        NOTE(review): there is no exception handling here, so an error
+        raised by the transcription, response or TTS helpers propagates to
+        whoever invoked this callback.
+        """
+        # Log the segment duration in seconds (sample count / sample rate).
+        print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
+        # Convert speech to text.
+        transcription = self._transcribe_audio(speech_audio, sample_rate)
+        if not transcription or len(transcription.strip()) < 2:
+            print("⚠️ Transcription quá ngắn hoặc trống")
+            return
+        print(f"📝 VAD Transcription: {transcription}")
+        self.current_transcription = transcription
+        # Generate the AI response.
+        response = self._generate_ai_response(transcription)
+        # Generate the TTS audio.
+        tts_audio_path = self._text_to_speech(response)
+        # Send the result to the callback, if one is registered.
+        if self.speech_callback:
+            self.speech_callback({
+                'transcription': transcription,
+                'response': response,
+                'tts_audio': tts_audio_path
+            })
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming (phương thức cũ cho compatibility)"""
         if not audio_data:
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
                     'tts_audio': None
                 }
+            # Sử dụng VAD để kiểm tra speech
+            if not self.vad_processor.is_speech(audio_array, sample_rate):
+                return {
+                    'transcription': "❌ Không phát hiện giọng nói",
+                    'response': "Vui lòng nói rõ hơn",
+                    'tts_audio': None
+                }
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
+            'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")