Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 22

Commit

b5e51ac

verified ·

1 Parent(s): c6367cc

Create speechbrain_vad.py

Browse files

Files changed (1) hide show

core/speechbrain_vad.py +125 -132

core/speechbrain_vad.py CHANGED Viewed

@@ -1,154 +1,147 @@
 import torch
 import torchaudio
 import numpy as np
-from speechbrain.inference import VAD
-from typing import List, Tuple, Optional
-import queue
-import threading
-import time
 from config.settings import settings
 # Streaming voice-activity detector built on the SpeechBrain VAD model.
class SpeechBrainVAD:
    """Chunk-by-chunk voice-activity detector with an energy-based fallback.

    Audio chunks are pushed in through ``process_stream``. Chunks classified
    as speech are accumulated in ``speech_buffer``; once silence has lasted
    at least ``min_silence_duration`` seconds (wall-clock time), the buffered
    speech is handed to the callback registered with ``start_stream``.
    """

    # Mean-square energy above which the fallback detector reports speech.
    ENERGY_THRESHOLD = 0.01

    def __init__(self):
        """Read the VAD settings and try to load the SpeechBrain model."""
        self.vad_model = None
        self.sample_rate = settings.SAMPLE_RATE
        self.threshold = settings.VAD_THRESHOLD
        self.min_silence_duration = settings.VAD_MIN_SILENCE_DURATION
        self.speech_pad_duration = settings.VAD_SPEECH_PAD_DURATION
        self.is_running = False
        self.audio_queue = queue.Queue()
        self.speech_buffer = []
        self.silence_start_time = None
        self.callback = None
        self._initialize_model()

    def _initialize_model(self):
        """Load the SpeechBrain VAD model; leave ``vad_model`` as None on failure."""
        try:
            print("🔄 Đang tải mô hình SpeechBrain VAD...")
            self.vad_model = VAD.from_hparams(
                source=settings.VAD_MODEL,
                savedir=f"pretrained_models/{settings.VAD_MODEL}"
            )
            print("✅ Đã tải mô hình VAD thành công")
        except Exception as e:
            # Best effort: without a model the energy-based fallback is used.
            print(f"❌ Lỗi tải mô hình VAD: {e}")
            self.vad_model = None

    def preprocess_audio(self, audio_data: np.ndarray, original_sr: int) -> np.ndarray:
        """Resample ``audio_data`` to ``self.sample_rate`` and peak-normalise it.

        Args:
            audio_data: Audio samples. When resampling, a multi-dimensional
                array is averaged over axis 0 (assumed to be the channel
                axis - TODO confirm against callers).
            original_sr: Sample rate of ``audio_data`` in Hz.

        Returns:
            The audio scaled so that its absolute peak is 1. Empty or
            all-zero input is returned unscaled.
        """
        audio_data = np.asarray(audio_data)
        if original_sr != self.sample_rate:
            audio_tensor = torch.from_numpy(audio_data).float()
            if audio_tensor.dim() > 1:
                audio_tensor = audio_tensor.mean(dim=0)  # down-mix to mono
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sr,
                new_freq=self.sample_rate
            )
            audio_data = resampler(audio_tensor).numpy()
        # np.max raises on an empty array, so bail out before normalising.
        if audio_data.size == 0:
            return audio_data
        # NOTE(review): every chunk is normalised independently, which discards
        # the relative loudness between chunks before the energy fallback runs.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak
        return audio_data

    def detect_voice_activity(self, audio_chunk: np.ndarray) -> bool:
        """Return True when ``audio_chunk`` is classified as speech.

        Uses the SpeechBrain model when it is loaded, and the energy-based
        detector when it is not or when the model call fails.
        """
        if self.vad_model is None:
            return self._energy_based_vad(audio_chunk)
        try:
            # The model expects a batch dimension in front of the samples.
            audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
            with torch.no_grad():
                prob = self.vad_model.get_speech_prob_chunk(audio_tensor)
            # The model yields one probability per frame; average them so a
            # chunk spanning several frames does not make ``.item()`` raise.
            return prob.mean().item() > self.threshold
        except Exception as e:
            print(f"❌ Lỗi VAD detection: {e}")
            return self._energy_based_vad(audio_chunk)

    def _energy_based_vad(self, audio_chunk: np.ndarray) -> bool:
        """Fallback detector: mean-square energy compared with a fixed threshold."""
        # Promote to float64 first so integer PCM input cannot overflow when
        # squared (an int16 sample of 256 squares to 0 in int16 arithmetic).
        samples = np.asarray(audio_chunk, dtype=np.float64)
        if samples.size == 0:
            return False
        return bool(np.mean(np.square(samples)) > self.ENERGY_THRESHOLD)

    def process_stream(self, audio_chunk: np.ndarray, original_sr: int):
        """Feed one chunk of a live stream through the detector.

        Returns:
            None when the stream is not running, otherwise whether the chunk
            was classified as speech.
        """
        if not self.is_running:
            return
        processed_audio = self.preprocess_audio(audio_chunk, original_sr)
        is_speech = self.detect_voice_activity(processed_audio)
        if is_speech:
            self.silence_start_time = None
            self.speech_buffer.extend(processed_audio)
            print("🎤 Đang nói...")
        elif self.silence_start_time is None:
            # First silent chunk after speech (or after start): start timing.
            self.silence_start_time = time.time()
        elif self.speech_buffer:
            silence_duration = time.time() - self.silence_start_time
            if silence_duration >= self.min_silence_duration:
                # Silence lasted long enough: the utterance is complete.
                self._process_speech_segment()
        return is_speech

    def _process_speech_segment(self):
        """Hand the buffered speech to the callback and reset the buffer."""
        if not self.speech_buffer:
            return
        speech_audio = np.array(self.speech_buffer)
        if self.callback and callable(self.callback):
            self.callback(speech_audio, self.sample_rate)
        self.speech_buffer = []
        self.silence_start_time = None
        print("✅ Đã xử lý segment giọng nói")

    def start_stream(self, callback: callable):
        """Start accepting chunks; ``callback(audio, sample_rate)`` gets each segment."""
        self.is_running = True
        self.callback = callback
        self.speech_buffer = []
        self.silence_start_time = None
        print("🎙️ Bắt đầu stream VAD...")

    def stop_stream(self):
        """Stop accepting chunks and flush any speech still buffered."""
        self.is_running = False
        if self.speech_buffer:
            self._process_speech_segment()
        print("🛑 Đã dừng stream VAD")

    def get_audio_chunk_from_stream(self, stream, chunk_size: int = 1024):
        """Read ``chunk_size`` frames of 16-bit PCM from ``stream``.

        Returns:
            A float32 array scaled to [-1, 1], or None when the read fails.
        """
        try:
            data = stream.read(chunk_size, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.int16)
            return audio_data.astype(np.float32) / 32768.0
        except Exception as e:
            print(f"❌ Lỗi đọc audio stream: {e}")
            return None

 import torch
 import torchaudio
 import numpy as np
+from typing import Optional, Callable
 from config.settings import settings
 # Windowed voice-activity detector built on the SpeechBrain VAD model.
class SpeechBrainVAD:
    """Buffer streamed audio and report the speech segments found in it.

    Chunks pushed through ``process_stream`` are accumulated in
    ``audio_buffer`` (a flat list of samples at ``self.sample_rate``). Once
    ``WINDOW_SECONDS`` of audio is available the window is analysed and every
    speech segment longer than ``MIN_SPEECH_SECONDS`` is passed to the
    callback registered with ``start_stream`` as
    ``callback(samples, sample_rate)``.
    """

    WINDOW_SECONDS = 2.0         # amount of audio analysed at once
    OVERLAP_SECONDS = 0.5        # tail kept in the buffer between windows
    MIN_SPEECH_SECONDS = 0.5     # shorter segments are not reported
    SENSITIVITY_MARGIN = 0.1     # subtracted from settings.VAD_THRESHOLD
    EXTRA_SILENCE_SECONDS = 0.3  # added to settings.VAD_MIN_SILENCE_DURATION

    def __init__(self):
        """Read the sample rate from settings and try to load the model."""
        self.model = None
        self.sample_rate = settings.SAMPLE_RATE
        self.is_streaming = False
        self.speech_callback = None
        self.audio_buffer = []
        self._initialize_model()

    def _initialize_model(self):
        """Load the SpeechBrain VAD model; leave ``model`` as None on failure."""
        try:
            # SpeechBrain 1.0 moved the inference interfaces from
            # ``speechbrain.pretrained`` to ``speechbrain.inference``.
            try:
                from speechbrain.inference import VAD
            except ImportError:
                from speechbrain.pretrained import VAD
            print("🔄 Đang tải VAD model từ SpeechBrain...")
            self.model = VAD.from_hparams(
                source=settings.VAD_MODEL,
                savedir=f"/tmp/{settings.VAD_MODEL.replace('/', '_')}"
            )
            print("✅ Đã tải VAD model thành công")
        except Exception as e:
            print(f"❌ Lỗi tải VAD model: {e}")
            self.model = None

    def start_stream(self, speech_callback: Callable):
        """Begin streaming.

        Args:
            speech_callback: Called as ``speech_callback(samples, sample_rate)``
                for every detected speech segment.

        Returns:
            False when no model is loaded, True otherwise.
        """
        if self.model is None:
            print("❌ VAD model chưa được khởi tạo")
            return False
        self.is_streaming = True
        self.speech_callback = speech_callback
        self.audio_buffer = []
        print("🎙️ Bắt đầu VAD streaming...")
        return True

    def stop_stream(self):
        """Stop streaming, analysing any buffered tail before discarding it."""
        was_streaming = self.is_streaming
        self.is_streaming = False
        # Audio that never filled a whole window would otherwise be dropped,
        # losing the end of the last utterance. Tails too short to contain a
        # reportable segment are skipped.
        min_samples = self.sample_rate * self.MIN_SPEECH_SECONDS
        if (
            was_streaming
            and self.model is not None
            and self.speech_callback is not None
            and len(self.audio_buffer) > min_samples
        ):
            self._process_buffer()
        self.speech_callback = None
        self.audio_buffer = []
        print("🛑 Đã dừng VAD streaming")

    def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
        """Append one chunk to the buffer and analyse it once a window is full.

        Does nothing when the stream is not running or no model is loaded.
        """
        if not self.is_streaming or self.model is None:
            return
        try:
            if sample_rate != self.sample_rate:
                audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
            # Flatten to plain floats so the buffer stays a flat sample list.
            # NOTE(review): assumes mono input; multi-channel chunks would be
            # interleaved by ravel() - confirm against callers.
            chunk = np.asarray(audio_chunk, dtype=np.float32).ravel()
            self.audio_buffer.extend(chunk.tolist())
            if len(self.audio_buffer) >= self.WINDOW_SECONDS * self.sample_rate:
                self._process_buffer()
        except Exception as e:
            print(f"❌ Lỗi xử lý VAD: {e}")

    def _process_buffer(self):
        """Run VAD over the buffered window and report the speech segments.

        Afterwards only the last ``OVERLAP_SECONDS`` of audio stay buffered,
        so consecutive windows overlap. On any error the buffer is cleared.
        """
        try:
            samples = np.asarray(self.audio_buffer, dtype=np.float32)
            min_samples = self.sample_rate * self.MIN_SPEECH_SECONDS
            for start_sample, end_sample in self._detect_segments(samples):
                speech_audio = samples[start_sample:end_sample].copy()
                if len(speech_audio) > min_samples:
                    print(f"🎯 VAD phát hiện speech: {len(speech_audio)/self.sample_rate:.2f}s")
                    if self.speech_callback:
                        self.speech_callback(speech_audio, self.sample_rate)
            keep_samples = int(self.sample_rate * self.OVERLAP_SECONDS)
            if len(self.audio_buffer) > keep_samples:
                self.audio_buffer = self.audio_buffer[-keep_samples:]
            else:
                self.audio_buffer = []
        except Exception as e:
            print(f"❌ Lỗi xử lý VAD buffer: {e}")
            self.audio_buffer = []

    def _detect_segments(self, samples: np.ndarray) -> list:
        """Return ``(start_sample, end_sample)`` pairs of speech in ``samples``.

        ``VAD.get_speech_segments`` only accepts an audio file path, so the
        in-memory window goes through the tensor-level steps instead: frame
        probabilities, thresholding, boundary extraction and merging of
        segments separated by short silences.
        """
        activation_th = settings.VAD_THRESHOLD - self.SENSITIVITY_MARGIN
        close_th = settings.VAD_MIN_SILENCE_DURATION + self.EXTRA_SILENCE_SECONDS
        pad = settings.VAD_SPEECH_PAD_DURATION

        wav = torch.from_numpy(samples).unsqueeze(0)
        with torch.no_grad():
            prob = self.model.get_speech_prob_chunk(wav)
        # Deactivate at half the activation level (the library defaults are
        # 0.5 / 0.25) so a segment is not cut by a brief dip in probability.
        decisions = self.model.apply_threshold(
            prob,
            activation_th=activation_th,
            deactivation_th=activation_th / 2,
        ).float()
        boundaries = self.model.get_boundaries(decisions, output_value="seconds")
        boundaries = self.model.merge_close_segments(boundaries, close_th=close_th)

        segments = []
        for start, end in torch.as_tensor(boundaries).reshape(-1, 2).tolist():
            # Pad each segment, clamped to the limits of the window.
            first = max(int((start - pad) * self.sample_rate), 0)
            last = min(int((end + pad) * self.sample_rate), len(samples))
            if last > first:
                segments.append((first, last))
        return segments

    def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample ``audio`` from ``orig_sr`` to ``target_sr``.

        Returns ``audio`` itself when the rates already match. If resampling
        fails, the input is returned unchanged (best effort).
        """
        if orig_sr == target_sr:
            return audio
        try:
            audio_tensor = torch.as_tensor(np.asarray(audio, dtype=np.float32)).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(orig_sr, target_sr)
            return resampler(audio_tensor).squeeze(0).numpy()
        except Exception as e:
            print(f"⚠️ Lỗi resample: {e}")
            return audio

    def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
        """Return whether ``audio_chunk`` looks like speech.

        Fails open: returns True when no model is loaded or the check raises,
        so audio is never dropped because VAD is unavailable.
        """
        if self.model is None:
            return True
        try:
            if sample_rate != self.sample_rate:
                audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
            audio_tensor = torch.as_tensor(np.asarray(audio_chunk, dtype=np.float32)).unsqueeze(0)
            with torch.no_grad():
                prob_speech = self.model.get_speech_prob_chunk(audio_tensor)
            # Average the per-frame probabilities into one score for the chunk.
            return prob_speech.mean().item() > (settings.VAD_THRESHOLD - self.SENSITIVITY_MARGIN)
        except Exception as e:
            print(f"❌ Lỗi kiểm tra speech: {e}")
            return True