datbkpro commited on
Commit
a00cee2
·
verified ·
1 Parent(s): 1b5b7a4

Update core/silero_vad.py

Browse files
Files changed (1) hide show
  1. core/silero_vad.py +391 -152
core/silero_vad.py CHANGED
@@ -1,26 +1,306 @@
1
 
2
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
- from typing import Callable
5
- from config.settings import settings
6
- import os
7
  import time
 
 
 
 
 
 
8
 
9
-
10
- class SileroVAD:
11
  def __init__(self):
12
  self.model = None
13
- self.utils = None
14
  self.sample_rate = 16000
15
  self.is_streaming = False
16
  self.speech_callback = None
17
  self.audio_buffer = []
 
 
18
  self.speech_start_time = 0
19
- self.min_speech_duration = 0.5 # Giây
 
 
 
 
 
 
 
 
20
 
21
- # Thêm cấu hình chunk size cho Silero
22
- self.chunk_size = 512 # Silero yêu cầu 512 samples cho 16000Hz
23
- self.chunk_duration = self.chunk_size / self.sample_rate # 0.032 giây
24
 
25
  self._initialize_model()
26
 
@@ -28,52 +308,32 @@ class SileroVAD:
28
  """Khởi tạo Silero VAD model"""
29
  try:
30
  print("🔄 Đang tải Silero VAD model...")
31
-
32
- self.model, self.utils = torch.hub.load(
33
  repo_or_dir='snakers4/silero-vad',
34
  model='silero_vad',
35
  force_reload=False,
36
  trust_repo=True
37
  )
38
-
39
  self.model.eval()
40
  print("✅ Đã tải Silero VAD model thành công")
41
-
42
  except Exception as e:
43
  print(f"❌ Lỗi tải Silero VAD model: {e}")
44
- self._initialize_model_fallback()
45
-
46
- def _initialize_model_fallback(self):
47
- """Fallback nếu torch.hub.load thất bại"""
48
- try:
49
- model_dir = torch.hub.get_dir()
50
- model_path = os.path.join(
51
- model_dir, 'snakers4_silero-vad_master', 'files', 'silero_vad.jit'
52
- )
53
-
54
- if os.path.exists(model_path):
55
- self.model = torch.jit.load(model_path)
56
- self.model.eval()
57
- print("✅ Đã tải Silero VAD model thành công (fallback)")
58
- else:
59
- print("❌ Không tìm thấy model file (fallback thất bại)")
60
- self.model = None
61
-
62
- except Exception as e:
63
- print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
64
  self.model = None
65
 
66
  def start_stream(self, speech_callback: Callable):
67
  """Bắt đầu stream với VAD"""
68
  if self.model is None:
69
- print("❌ Silero VAD model chưa được khởi tạo")
70
  return False
71
 
72
  self.is_streaming = True
73
  self.speech_callback = speech_callback
74
  self.audio_buffer = []
 
 
 
75
  self.speech_start_time = 0
76
- print("🎙️ Bắt đầu Silero VAD streaming...")
 
77
  return True
78
 
79
  def stop_stream(self):
@@ -81,11 +341,13 @@ class SileroVAD:
81
  self.is_streaming = False
82
  self.speech_callback = None
83
  self.audio_buffer = []
84
- self.speech_start_time = 0
85
- print("🛑 Đã dừng Silero VAD streaming")
 
 
86
 
87
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
88
- """Xử lý audio chunk với Silero VAD - ĐÃ SỬA LỖI"""
89
  if not self.is_streaming or self.model is None:
90
  return
91
 
@@ -94,71 +356,96 @@ class SileroVAD:
94
  if sample_rate != self.sample_rate:
95
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
96
 
97
- # Thêm vào buffer
98
  self.audio_buffer.extend(audio_chunk)
99
 
100
- # Xử lý từng chunk 512 samples (Silero requirement)
101
  while len(self.audio_buffer) >= self.chunk_size:
102
  chunk = self.audio_buffer[:self.chunk_size]
103
- self._process_single_chunk(np.array(chunk))
104
- # Giữ lại phần thừa cho chunk tiếp theo
105
  self.audio_buffer = self.audio_buffer[self.chunk_size:]
106
 
107
  except Exception as e:
108
- print(f"❌ Lỗi xử lý Silero VAD: {e}")
109
 
110
- def _process_single_chunk(self, audio_chunk: np.ndarray):
111
- """Xử lý một chunk 512 samples duy nhất"""
112
- try:
113
- # Chuẩn hóa audio
114
- audio_chunk = self._normalize_audio(audio_chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- # Đảm bảo đúng kích thước
117
- if len(audio_chunk) != self.chunk_size:
118
- # Nếu không đủ, pad với zeros
119
- if len(audio_chunk) < self.chunk_size:
120
- padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
121
- audio_chunk = np.concatenate([audio_chunk, padding])
122
- else:
123
- audio_chunk = audio_chunk[:self.chunk_size]
124
-
125
- # Dự đoán xác suất speech
126
- speech_prob = self._get_speech_probability(audio_chunk)
127
- print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
128
-
129
- # Xử lý logic speech detection
130
- current_time = time.time()
131
 
132
- if speech_prob > settings.VAD_THRESHOLD:
133
- if self.speech_start_time == 0:
134
- self.speech_start_time = current_time
135
- print("🎯 Bắt đầu phát hiện speech")
136
-
137
- speech_duration = current_time - self.speech_start_time
 
 
 
 
 
 
 
138
 
139
- # Nếu đủ thời gian speech, gọi callback
140
- if speech_duration >= self.min_speech_duration:
141
- if self.speech_callback:
142
- # Thu thập tất cả audio từ khi bắt đầu speech
143
- full_audio = self._collect_speech_audio()
144
- if len(full_audio) > 0:
145
- self.speech_callback(full_audio, self.sample_rate)
146
- self.speech_start_time = 0
147
- else:
148
- if self.speech_start_time > 0:
149
- print("🔇 Kết thúc speech segment")
150
- self.speech_start_time = 0
151
-
152
- except Exception as e:
153
- print(f"❌ Lỗi xử lý Silero VAD chunk: {e}")
154
-
155
- def _collect_speech_audio(self) -> np.ndarray:
156
- """Thu thập toàn bộ audio từ khi bắt đầu speech"""
157
- # Trong implementation thực tế, bạn cần lưu lại audio
158
- # từ khi bắt đầu phát hiện speech đến hiện tại
159
- # Đây là simplified version
160
- min_samples = int(self.sample_rate * self.min_speech_duration)
161
- return np.random.randn(min_samples).astype(np.float32) # Placeholder
 
 
 
 
 
 
 
162
 
163
  def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
164
  """Chuẩn hóa audio"""
@@ -169,24 +456,16 @@ class SileroVAD:
169
  return np.clip(audio, -1.0, 1.0)
170
 
171
  def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
172
- """Trả về xác suất speech - ĐÃ SỬA LỖI"""
173
  try:
174
- # ✅ Đảm bảo đúng kích thước 512 samples
175
  if len(audio_chunk) != self.chunk_size:
176
- # Resize về đúng 512 samples
177
- if len(audio_chunk) > self.chunk_size:
178
- audio_chunk = audio_chunk[:self.chunk_size]
179
- else:
180
- padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
181
- audio_chunk = np.concatenate([audio_chunk, padding])
182
-
183
  audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
184
-
185
  with torch.no_grad():
186
  return self.model(audio_tensor, self.sample_rate).item()
187
-
188
  except Exception as e:
189
- print(f"❌ Lỗi lấy speech probability: {e}")
190
  return 0.0
191
 
192
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
@@ -195,64 +474,24 @@ class SileroVAD:
195
  return audio
196
  try:
197
  from scipy import signal
198
- # Tính số samples mới
199
  duration = len(audio) / orig_sr
200
  new_length = int(duration * target_sr)
201
-
202
- # Resample
203
  resampled_audio = signal.resample(audio, new_length)
204
  return resampled_audio.astype(np.float32)
205
-
206
- except ImportError:
207
- # Fallback simple resampling
208
- orig_len = len(audio)
209
- new_len = int(orig_len * target_sr / orig_sr)
210
- x_old = np.linspace(0, 1, orig_len)
211
- x_new = np.linspace(0, 1, new_len)
212
- return np.interp(x_new, x_old, audio).astype(np.float32)
213
- except Exception as e:
214
- print(f"⚠️ Lỗi resample: {e}")
215
  return audio
216
 
217
  def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
218
- """Kiểm tra chunk có phải speech không - ĐÃ SỬA"""
219
  if self.model is None:
220
  return True
221
- try:
222
- if sample_rate != self.sample_rate:
223
- audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
224
- audio_chunk = self._normalize_audio(audio_chunk)
225
-
226
- # ✅ Chia thành các chunk 512 samples và kiểm tra trung bình
227
- chunk_size = 512
228
- speech_probs = []
229
-
230
- for i in range(0, len(audio_chunk), chunk_size):
231
- chunk = audio_chunk[i:i+chunk_size]
232
- if len(chunk) == chunk_size:
233
- prob = self._get_speech_probability(chunk)
234
- speech_probs.append(prob)
235
 
236
- if not speech_probs:
237
- return False
238
-
239
- avg_prob = np.mean(speech_probs)
240
- return avg_prob > settings.VAD_THRESHOLD
241
-
242
- except Exception as e:
243
- print(f"❌ Lỗi kiểm tra speech: {e}")
244
- return True
245
-
246
- def get_speech_probability(self, audio_chunk: np.ndarray, sample_rate: int) -> float:
247
- """Lấy xác suất speech trung bình"""
248
- if self.model is None:
249
- return 0.0
250
  try:
251
  if sample_rate != self.sample_rate:
252
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
253
  audio_chunk = self._normalize_audio(audio_chunk)
254
 
255
- # Chia thành các chunk 512 samples
256
  chunk_size = 512
257
  speech_probs = []
258
 
@@ -262,8 +501,8 @@ class SileroVAD:
262
  prob = self._get_speech_probability(chunk)
263
  speech_probs.append(prob)
264
 
265
- return np.mean(speech_probs) if speech_probs else 0.0
266
 
267
  except Exception as e:
268
- print(f"❌ Lỗi lấy speech probability: {e}")
269
- return 0.0
 
1
 
2
+ # import torch
3
+ # import numpy as np
4
+ # from typing import Callable
5
+ # from config.settings import settings
6
+ # import os
7
+ # import time
8
+
9
+
10
+ # class SileroVAD:
11
+ # def __init__(self):
12
+ # self.model = None
13
+ # self.utils = None
14
+ # self.sample_rate = 16000
15
+ # self.is_streaming = False
16
+ # self.speech_callback = None
17
+ # self.audio_buffer = []
18
+ # self.speech_start_time = 0
19
+ # self.min_speech_duration = 0.5 # Giây
20
+
21
+ # # ✅ Thêm cấu hình chunk size cho Silero
22
+ # self.chunk_size = 512 # Silero yêu cầu 512 samples cho 16000Hz
23
+ # self.chunk_duration = self.chunk_size / self.sample_rate # 0.032 giây
24
+
25
+ # self._initialize_model()
26
+
27
+ # def _initialize_model(self):
28
+ # """Khởi tạo Silero VAD model"""
29
+ # try:
30
+ # print("🔄 Đang tải Silero VAD model...")
31
+
32
+ # self.model, self.utils = torch.hub.load(
33
+ # repo_or_dir='snakers4/silero-vad',
34
+ # model='silero_vad',
35
+ # force_reload=False,
36
+ # trust_repo=True
37
+ # )
38
+
39
+ # self.model.eval()
40
+ # print("✅ Đã tải Silero VAD model thành công")
41
+
42
+ # except Exception as e:
43
+ # print(f"❌ Lỗi tải Silero VAD model: {e}")
44
+ # self._initialize_model_fallback()
45
+
46
+ # def _initialize_model_fallback(self):
47
+ # """Fallback nếu torch.hub.load thất bại"""
48
+ # try:
49
+ # model_dir = torch.hub.get_dir()
50
+ # model_path = os.path.join(
51
+ # model_dir, 'snakers4_silero-vad_master', 'files', 'silero_vad.jit'
52
+ # )
53
+
54
+ # if os.path.exists(model_path):
55
+ # self.model = torch.jit.load(model_path)
56
+ # self.model.eval()
57
+ # print("✅ Đã tải Silero VAD model thành công (fallback)")
58
+ # else:
59
+ # print("❌ Không tìm thấy model file (fallback thất bại)")
60
+ # self.model = None
61
+
62
+ # except Exception as e:
63
+ # print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
64
+ # self.model = None
65
+
66
+ # def start_stream(self, speech_callback: Callable):
67
+ # """Bắt đầu stream với VAD"""
68
+ # if self.model is None:
69
+ # print("❌ Silero VAD model chưa được khởi tạo")
70
+ # return False
71
+
72
+ # self.is_streaming = True
73
+ # self.speech_callback = speech_callback
74
+ # self.audio_buffer = []
75
+ # self.speech_start_time = 0
76
+ # print("🎙️ Bắt đầu Silero VAD streaming...")
77
+ # return True
78
+
79
+ # def stop_stream(self):
80
+ # """Dừng stream"""
81
+ # self.is_streaming = False
82
+ # self.speech_callback = None
83
+ # self.audio_buffer = []
84
+ # self.speech_start_time = 0
85
+ # print("🛑 Đã dừng Silero VAD streaming")
86
+
87
+ # def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
88
+ # """Xử lý audio chunk với Silero VAD - ĐÃ SỬA LỖI"""
89
+ # if not self.is_streaming or self.model is None:
90
+ # return
91
+
92
+ # try:
93
+ # # Resample nếu cần
94
+ # if sample_rate != self.sample_rate:
95
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
96
+
97
+ # # Thêm vào buffer
98
+ # self.audio_buffer.extend(audio_chunk)
99
+
100
+ # # ✅ Xử lý từng chunk 512 samples (Silero requirement)
101
+ # while len(self.audio_buffer) >= self.chunk_size:
102
+ # chunk = self.audio_buffer[:self.chunk_size]
103
+ # self._process_single_chunk(np.array(chunk))
104
+ # # Giữ lại phần thừa cho chunk tiếp theo
105
+ # self.audio_buffer = self.audio_buffer[self.chunk_size:]
106
+
107
+ # except Exception as e:
108
+ # print(f"❌ Lỗi xử lý Silero VAD: {e}")
109
+
110
+ # def _process_single_chunk(self, audio_chunk: np.ndarray):
111
+ # """Xử lý một chunk 512 samples duy nhất"""
112
+ # try:
113
+ # # Chuẩn hóa audio
114
+ # audio_chunk = self._normalize_audio(audio_chunk)
115
+
116
+ # # Đảm bảo đúng kích thước
117
+ # if len(audio_chunk) != self.chunk_size:
118
+ # # Nếu không đủ, pad với zeros
119
+ # if len(audio_chunk) < self.chunk_size:
120
+ # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
121
+ # audio_chunk = np.concatenate([audio_chunk, padding])
122
+ # else:
123
+ # audio_chunk = audio_chunk[:self.chunk_size]
124
+
125
+ # # Dự đoán xác suất speech
126
+ # speech_prob = self._get_speech_probability(audio_chunk)
127
+ # print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
128
+
129
+ # # Xử lý logic speech detection
130
+ # current_time = time.time()
131
+
132
+ # if speech_prob > settings.VAD_THRESHOLD:
133
+ # if self.speech_start_time == 0:
134
+ # self.speech_start_time = current_time
135
+ # print("🎯 Bắt đầu phát hiện speech")
136
+
137
+ # speech_duration = current_time - self.speech_start_time
138
+
139
+ # # Nếu đủ thời gian speech, gọi callback
140
+ # if speech_duration >= self.min_speech_duration:
141
+ # if self.speech_callback:
142
+ # # Thu thập tất cả audio từ khi bắt đầu speech
143
+ # full_audio = self._collect_speech_audio()
144
+ # if len(full_audio) > 0:
145
+ # self.speech_callback(full_audio, self.sample_rate)
146
+ # self.speech_start_time = 0
147
+ # else:
148
+ # if self.speech_start_time > 0:
149
+ # print("🔇 Kết thúc speech segment")
150
+ # self.speech_start_time = 0
151
+
152
+ # except Exception as e:
153
+ # print(f"❌ Lỗi xử lý Silero VAD chunk: {e}")
154
+
155
+ # def _collect_speech_audio(self) -> np.ndarray:
156
+ # """Thu thập toàn bộ audio từ khi bắt đầu speech"""
157
+ # # Trong implementation thực tế, bạn cần lưu lại audio
158
+ # # từ khi bắt đầu phát hiện speech đến hiện tại
159
+ # # Đây là simplified version
160
+ # min_samples = int(self.sample_rate * self.min_speech_duration)
161
+ # return np.random.randn(min_samples).astype(np.float32) # Placeholder
162
+
163
+ # def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
164
+ # """Chuẩn hóa audio"""
165
+ # if audio.dtype != np.float32:
166
+ # audio = audio.astype(np.float32)
167
+ # if np.max(np.abs(audio)) > 1.0:
168
+ # audio = audio / 32768.0
169
+ # return np.clip(audio, -1.0, 1.0)
170
+
171
+ # def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
172
+ # """Trả về xác suất speech - ĐÃ SỬA LỖI"""
173
+ # try:
174
+ # # ✅ Đảm bảo đúng kích thước 512 samples
175
+ # if len(audio_chunk) != self.chunk_size:
176
+ # # Resize về đúng 512 samples
177
+ # if len(audio_chunk) > self.chunk_size:
178
+ # audio_chunk = audio_chunk[:self.chunk_size]
179
+ # else:
180
+ # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
181
+ # audio_chunk = np.concatenate([audio_chunk, padding])
182
+
183
+ # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
184
+
185
+ # with torch.no_grad():
186
+ # return self.model(audio_tensor, self.sample_rate).item()
187
+
188
+ # except Exception as e:
189
+ # print(f"❌ Lỗi lấy speech probability: {e}")
190
+ # return 0.0
191
+
192
+ # def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
193
+ # """Resample audio"""
194
+ # if orig_sr == target_sr:
195
+ # return audio
196
+ # try:
197
+ # from scipy import signal
198
+ # # Tính số samples mới
199
+ # duration = len(audio) / orig_sr
200
+ # new_length = int(duration * target_sr)
201
+
202
+ # # Resample
203
+ # resampled_audio = signal.resample(audio, new_length)
204
+ # return resampled_audio.astype(np.float32)
205
+
206
+ # except ImportError:
207
+ # # Fallback simple resampling
208
+ # orig_len = len(audio)
209
+ # new_len = int(orig_len * target_sr / orig_sr)
210
+ # x_old = np.linspace(0, 1, orig_len)
211
+ # x_new = np.linspace(0, 1, new_len)
212
+ # return np.interp(x_new, x_old, audio).astype(np.float32)
213
+ # except Exception as e:
214
+ # print(f"⚠️ Lỗi resample: {e}")
215
+ # return audio
216
+
217
+ # def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
218
+ # """Kiểm tra chunk có phải speech không - ĐÃ SỬA"""
219
+ # if self.model is None:
220
+ # return True
221
+ # try:
222
+ # if sample_rate != self.sample_rate:
223
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
224
+ # audio_chunk = self._normalize_audio(audio_chunk)
225
+
226
+ # # ✅ Chia thành các chunk 512 samples và kiểm tra trung bình
227
+ # chunk_size = 512
228
+ # speech_probs = []
229
+
230
+ # for i in range(0, len(audio_chunk), chunk_size):
231
+ # chunk = audio_chunk[i:i+chunk_size]
232
+ # if len(chunk) == chunk_size:
233
+ # prob = self._get_speech_probability(chunk)
234
+ # speech_probs.append(prob)
235
+
236
+ # if not speech_probs:
237
+ # return False
238
+
239
+ # avg_prob = np.mean(speech_probs)
240
+ # return avg_prob > settings.VAD_THRESHOLD
241
+
242
+ # except Exception as e:
243
+ # print(f"❌ Lỗi kiểm tra speech: {e}")
244
+ # return True
245
+
246
+ # def get_speech_probability(self, audio_chunk: np.ndarray, sample_rate: int) -> float:
247
+ # """Lấy xác suất speech trung bình"""
248
+ # if self.model is None:
249
+ # return 0.0
250
+ # try:
251
+ # if sample_rate != self.sample_rate:
252
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
253
+ # audio_chunk = self._normalize_audio(audio_chunk)
254
+
255
+ # # Chia thành các chunk 512 samples
256
+ # chunk_size = 512
257
+ # speech_probs = []
258
+
259
+ # for i in range(0, len(audio_chunk), chunk_size):
260
+ # chunk = audio_chunk[i:i+chunk_size]
261
+ # if len(chunk) == chunk_size:
262
+ # prob = self._get_speech_probability(chunk)
263
+ # speech_probs.append(prob)
264
+
265
+ # return np.mean(speech_probs) if speech_probs else 0.0
266
+
267
+ # except Exception as e:
268
+ # print(f"❌ Lỗi lấy speech probability: {e}")
269
+ # return 0.0
270
+ import io
271
  import numpy as np
272
+ import soundfile as sf
 
 
273
  import time
274
+ import traceback
275
+ import threading
276
+ import queue
277
+ from groq import Groq
278
+ from typing import Optional, Dict, Any, Callable
279
+ from config.settings import settings
280
 
281
+ class OptimizedSileroVAD:
 
282
  def __init__(self):
283
  self.model = None
 
284
  self.sample_rate = 16000
285
  self.is_streaming = False
286
  self.speech_callback = None
287
  self.audio_buffer = []
288
+ self.speech_buffer = [] # Buffer cho speech đang diễn ra
289
+ self.state = "silence" # silence, speech, processing
290
  self.speech_start_time = 0
291
+ self.last_voice_time = 0
292
+
293
+ # Cấu hình tối ưu
294
+ self.chunk_size = 512
295
+ self.speech_threshold = settings.VAD_THRESHOLD
296
+ self.min_speech_duration = settings.VAD_MIN_SPEECH_DURATION
297
+ self.min_silence_duration = settings.VAD_MIN_SILENCE_DURATION
298
+ self.speech_pad_duration = settings.VAD_SPEECH_PAD_DURATION
299
+ self.pre_speech_buffer = settings.VAD_PRE_SPEECH_BUFFER
300
 
301
+ # Buffer cho pre-speech
302
+ self.pre_speech_samples = int(self.pre_speech_buffer * self.sample_rate)
303
+ self.pre_speech_buffer = []
304
 
305
  self._initialize_model()
306
 
 
308
  """Khởi tạo Silero VAD model"""
309
  try:
310
  print("🔄 Đang tải Silero VAD model...")
311
+ self.model, utils = torch.hub.load(
 
312
  repo_or_dir='snakers4/silero-vad',
313
  model='silero_vad',
314
  force_reload=False,
315
  trust_repo=True
316
  )
 
317
  self.model.eval()
318
  print("✅ Đã tải Silero VAD model thành công")
 
319
  except Exception as e:
320
  print(f"❌ Lỗi tải Silero VAD model: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  self.model = None
322
 
323
  def start_stream(self, speech_callback: Callable):
324
  """Bắt đầu stream với VAD"""
325
  if self.model is None:
 
326
  return False
327
 
328
  self.is_streaming = True
329
  self.speech_callback = speech_callback
330
  self.audio_buffer = []
331
+ self.speech_buffer = []
332
+ self.pre_speech_buffer = []
333
+ self.state = "silence"
334
  self.speech_start_time = 0
335
+ self.last_voice_time = 0
336
+ print("🎙️ Bắt đầu VAD streaming với cấu hình tối ưu...")
337
  return True
338
 
339
  def stop_stream(self):
 
341
  self.is_streaming = False
342
  self.speech_callback = None
343
  self.audio_buffer = []
344
+ self.speech_buffer = []
345
+ self.pre_speech_buffer = []
346
+ self.state = "silence"
347
+ print("🛑 Đã dừng VAD streaming")
348
 
349
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
350
+ """Xử lý audio chunk với VAD tối ưu"""
351
  if not self.is_streaming or self.model is None:
352
  return
353
 
 
356
  if sample_rate != self.sample_rate:
357
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
358
 
359
+ # Thêm vào buffer chính
360
  self.audio_buffer.extend(audio_chunk)
361
 
362
+ # Xử lý từng chunk
363
  while len(self.audio_buffer) >= self.chunk_size:
364
  chunk = self.audio_buffer[:self.chunk_size]
365
+ self._process_vad_chunk(np.array(chunk))
 
366
  self.audio_buffer = self.audio_buffer[self.chunk_size:]
367
 
368
  except Exception as e:
369
+ print(f"❌ Lỗi xử lý VAD: {e}")
370
 
371
+ def _process_vad_chunk(self, audio_chunk: np.ndarray):
372
+ """Xử lý VAD cho một chunk - TỐI ƯU HÓA"""
373
+ current_time = time.time()
374
+
375
+ # Chuẩn hóa audio
376
+ audio_chunk = self._normalize_audio(audio_chunk)
377
+
378
+ # Lấy xác suất speech
379
+ speech_prob = self._get_speech_probability(audio_chunk)
380
+
381
+ # Logic state machine cải tiến
382
+ if self.state == "silence":
383
+ if speech_prob > self.speech_threshold:
384
+ print("🎯 Bắt đầu phát hiện speech")
385
+ self.state = "speech"
386
+ self.speech_start_time = current_time
387
+ self.last_voice_time = current_time
388
+ # Khởi tạo speech buffer với pre-speech data
389
+ self.speech_buffer = self.pre_speech_buffer.copy()
390
+ self.speech_buffer.extend(audio_chunk)
391
+ else:
392
+ # Lưu pre-speech buffer (giới hạn kích thước)
393
+ self.pre_speech_buffer.extend(audio_chunk)
394
+ if len(self.pre_speech_buffer) > self.pre_speech_samples:
395
+ self.pre_speech_buffer = self.pre_speech_buffer[-self.pre_speech_samples:]
396
+
397
+ elif self.state == "speech":
398
+ # Luôn thêm vào speech buffer
399
+ self.speech_buffer.extend(audio_chunk)
400
 
401
+ # Cập nhật thời gian voice cuối cùng
402
+ if speech_prob > self.speech_threshold:
403
+ self.last_voice_time = current_time
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
+ # Kiểm tra kết thúc speech
406
+ silence_duration = current_time - self.last_voice_time
407
+ speech_duration = current_time - self.speech_start_time
408
+
409
+ # Điều kiện kết thúc: im lặng đủ lâu VÀ đã nói đủ dài
410
+ if (silence_duration >= self.min_silence_duration and
411
+ speech_duration >= self.min_speech_duration):
412
+ print(f"🔇 Kết thúc speech segment (duration: {speech_duration:.2f}s)")
413
+ self._finalize_speech()
414
+ # Hoặc speech quá dài (timeout)
415
+ elif speech_duration > settings.MAX_AUDIO_DURATION:
416
+ print(f"⏰ Speech timeout ({speech_duration:.2f}s)")
417
+ self._finalize_speech()
418
 
419
+ elif self.state == "processing":
420
+ # Đang xử lý, không nhận thêm audio
421
+ pass
422
+
423
+ def _finalize_speech(self):
424
+ """Hoàn thành xử lý speech segment"""
425
+ if not self.speech_buffer or len(self.speech_buffer) == 0:
426
+ self.state = "silence"
427
+ return
428
+
429
+ # Chuyển sang state processing để tránh nhận thêm audio
430
+ self.state = "processing"
431
+
432
+ # Tạo audio array từ buffer
433
+ speech_audio = np.array(self.speech_buffer, dtype=np.float32)
434
+
435
+ # Gọi callback trong thread riêng
436
+ if self.speech_callback:
437
+ threading.Thread(
438
+ target=self.speech_callback,
439
+ args=(speech_audio, self.sample_rate),
440
+ daemon=True
441
+ ).start()
442
+
443
+ # Reset buffers nhưng giữ pre-speech
444
+ self.speech_buffer = []
445
+ self.audio_buffer = []
446
+
447
+ # Quay lại state silence sau khi xử lý
448
+ self.state = "silence"
449
 
450
  def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
451
  """Chuẩn hóa audio"""
 
456
  return np.clip(audio, -1.0, 1.0)
457
 
458
  def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
459
+ """Lấy xác suất speech"""
460
  try:
 
461
  if len(audio_chunk) != self.chunk_size:
462
+ return 0.0
463
+
 
 
 
 
 
464
  audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
 
465
  with torch.no_grad():
466
  return self.model(audio_tensor, self.sample_rate).item()
 
467
  except Exception as e:
468
+ print(f"❌ Lỗi speech probability: {e}")
469
  return 0.0
470
 
471
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
 
474
  return audio
475
  try:
476
  from scipy import signal
 
477
  duration = len(audio) / orig_sr
478
  new_length = int(duration * target_sr)
 
 
479
  resampled_audio = signal.resample(audio, new_length)
480
  return resampled_audio.astype(np.float32)
481
+ except Exception:
 
 
 
 
 
 
 
 
 
482
  return audio
483
 
484
  def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
485
+ """Kiểm tra speech (cho compatibility)"""
486
  if self.model is None:
487
  return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  try:
490
  if sample_rate != self.sample_rate:
491
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
492
  audio_chunk = self._normalize_audio(audio_chunk)
493
 
494
+ # Kiểm tra multiple chunks
495
  chunk_size = 512
496
  speech_probs = []
497
 
 
501
  prob = self._get_speech_probability(chunk)
502
  speech_probs.append(prob)
503
 
504
+ return np.mean(speech_probs) > self.speech_threshold if speech_probs else False
505
 
506
  except Exception as e:
507
+ print(f"❌ Lỗi kiểm tra speech: {e}")
508
+ return True