datbkpro committed on
Commit
56d3d8c
·
verified ·
1 Parent(s): ff9d355

Update core/silero_vad.py

Browse files
Files changed (1) hide show
  1. core/silero_vad.py +338 -97
core/silero_vad.py CHANGED
@@ -1,8 +1,248 @@
1
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  from typing import Optional, Callable
4
  from config.settings import settings
5
  import os
 
6
 
7
  class SileroVAD:
8
  def __init__(self):
@@ -11,52 +251,45 @@ class SileroVAD:
11
  self.is_streaming = False
12
  self.speech_callback = None
13
  self.audio_buffer = []
 
 
14
  self._initialize_model()
15
 
16
  def _initialize_model(self):
17
- """Khởi tạo Silero VAD model sử dụng torch.hub"""
18
  try:
19
- print("🔄 Đang tải Silero VAD model từ torch.hub...")
20
 
21
- # Sử dụng torch.hub để load model (cách chính thức)
22
  self.model = torch.hub.load(
23
- repo_or_dir=settings.VAD_MODEL,
24
  model='silero_vad',
25
- force_reload=False, # Sử dụng cache nếu có
26
  trust_repo=True
27
  )
28
 
 
29
  print("✅ Đã tải Silero VAD model thành công")
30
 
31
  except Exception as e:
32
  print(f"❌ Lỗi tải Silero VAD model: {e}")
33
- print("🔄 Đang thử cách tải thay thế...")
34
  self._initialize_model_fallback()
35
 
36
  def _initialize_model_fallback(self):
37
- """Fallback method nếu cách chính thức không hoạt động"""
38
  try:
39
- # Cách 2: Sử dụng direct download
40
- model_urls = {
41
- 'silero_vad.jit': 'https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.jit'
42
- }
43
-
44
- # Tạo thư mục cache
45
- os.makedirs('./models', exist_ok=True)
46
- model_path = './models/silero_vad.jit'
47
-
48
- if not os.path.exists(model_path):
49
- print("📥 Đang download Silero VAD model...")
50
- torch.hub.download_url_to_file(
51
- model_urls['silero_vad.jit'],
52
- model_path
53
- )
54
-
55
- # Load model
56
- self.model = torch.jit.load(model_path)
57
- self.model.eval()
58
- print("✅ Đã tải Silero VAD model thành công (fallback)")
59
 
 
 
 
 
 
 
 
 
60
  except Exception as e:
61
  print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
62
  self.model = None
@@ -70,6 +303,7 @@ class SileroVAD:
70
  self.is_streaming = True
71
  self.speech_callback = speech_callback
72
  self.audio_buffer = []
 
73
  print("🎙️ Bắt đầu Silero VAD streaming...")
74
  return True
75
 
@@ -78,10 +312,11 @@ class SileroVAD:
78
  self.is_streaming = False
79
  self.speech_callback = None
80
  self.audio_buffer = []
 
81
  print("🛑 Đã dừng Silero VAD streaming")
82
 
83
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
84
- """Xử lý audio chunk với Silero VAD"""
85
  if not self.is_streaming or self.model is None:
86
  return
87
 
@@ -93,72 +328,108 @@ class SileroVAD:
93
  # Thêm vào buffer
94
  self.audio_buffer.extend(audio_chunk)
95
 
96
- # Xử lý khi buffer đủ lớn (1 giây)
97
  buffer_duration = len(self.audio_buffer) / self.sample_rate
98
- if buffer_duration >= 1.0:
99
  self._process_buffer()
100
 
101
  except Exception as e:
102
  print(f"❌ Lỗi xử lý Silero VAD: {e}")
103
 
104
  def _process_buffer(self):
105
- """Xử lý buffer audio với Silero VAD"""
106
  try:
107
- chunk_size = self.sample_rate # 1 giây
108
  if len(self.audio_buffer) < chunk_size:
109
  return
110
 
111
- # Lấy chunk 1 giây
112
  audio_chunk = np.array(self.audio_buffer[:chunk_size])
113
 
114
- # Chuẩn hóa audio cho Silero
115
- if audio_chunk.dtype != np.float32:
116
- audio_chunk = audio_chunk.astype(np.float32)
117
- if np.max(np.abs(audio_chunk)) > 1.0:
118
- audio_chunk = audio_chunk / 32768.0 # Normalize từ int16
119
-
120
- # Đảm bảo audio trong range [-1, 1]
121
- audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
122
-
123
- # Chuyển thành tensor
124
- audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
125
 
126
- # Phát hiện speech với Silero VAD
127
- with torch.no_grad():
128
- speech_prob = self.model(audio_tensor, self.sample_rate).item()
129
 
130
  print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
131
 
132
  # Ngưỡng phát hiện speech
133
  if speech_prob > settings.VAD_THRESHOLD:
134
- print(f"🎯 Silero VAD phát hiện speech: {speech_prob:.3f}")
 
 
 
 
135
 
136
- # Gọi callback với speech segment
137
- if self.speech_callback:
138
- self.speech_callback(audio_chunk, self.sample_rate)
139
-
140
- # Giữ lại 0.3 giây cuối để overlap
141
- keep_samples = int(self.sample_rate * 0.3)
142
- if len(self.audio_buffer) > keep_samples:
143
- self.audio_buffer = self.audio_buffer[-keep_samples:]
 
 
 
 
144
  else:
145
- self.audio_buffer = []
 
 
 
 
 
 
 
 
 
 
146
 
147
  except Exception as e:
148
  print(f"❌ Lỗi xử lý Silero VAD buffer: {e}")
149
  self.audio_buffer = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
152
- """Resample audio nếu cần"""
153
  if orig_sr == target_sr:
154
  return audio
155
 
156
  try:
157
- # Simple resampling bằng interpolation
158
  orig_length = len(audio)
159
  new_length = int(orig_length * target_sr / orig_sr)
160
 
161
- # Linear interpolation
162
  x_old = np.linspace(0, 1, orig_length)
163
  x_new = np.linspace(0, 1, new_length)
164
  resampled_audio = np.interp(x_new, x_old, audio)
@@ -171,7 +442,7 @@ class SileroVAD:
171
  def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
172
  """Kiểm tra xem audio chunk có phải là speech không"""
173
  if self.model is None:
174
- return True # Fallback: luôn coi là speech
175
 
176
  try:
177
  # Resample nếu cần
@@ -179,26 +450,11 @@ class SileroVAD:
179
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
180
 
181
  # Chuẩn hóa audio
182
- if audio_chunk.dtype != np.float32:
183
- audio_chunk = audio_chunk.astype(np.float32)
184
- if np.max(np.abs(audio_chunk)) > 1.0:
185
- audio_chunk = audio_chunk / 32768.0
186
-
187
- audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
188
 
189
- # Đảm bảo độ dài phù hợp
190
- if len(audio_chunk) < 512:
191
- padding = np.zeros(512 - len(audio_chunk), dtype=np.float32)
192
- audio_chunk = np.concatenate([audio_chunk, padding])
193
-
194
- # Chuyển thành tensor
195
- audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
196
-
197
- # Phát hiện speech
198
- with torch.no_grad():
199
- speech_prob = self.model(audio_tensor, self.sample_rate).item()
200
 
201
- # Kiểm tra ngưỡng
202
  return speech_prob > settings.VAD_THRESHOLD
203
 
204
  except Exception as e:
@@ -216,24 +472,9 @@ class SileroVAD:
216
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
217
 
218
  # Chuẩn hóa audio
219
- if audio_chunk.dtype != np.float32:
220
- audio_chunk = audio_chunk.astype(np.float32)
221
- if np.max(np.abs(audio_chunk)) > 1.0:
222
- audio_chunk = audio_chunk / 32768.0
223
-
224
- audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
225
-
226
- # Đảm bảo độ dài phù hợp
227
- if len(audio_chunk) < 512:
228
- padding = np.zeros(512 - len(audio_chunk), dtype=np.float32)
229
- audio_chunk = np.concatenate([audio_chunk, padding])
230
 
231
- # Chuyển thành tensor
232
- audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
233
-
234
- # Phát hiện speech
235
- with torch.no_grad():
236
- return self.model(audio_tensor, self.sample_rate).item()
237
 
238
  except Exception as e:
239
  print(f"❌ Lỗi lấy speech probability: {e}")
 
1
+ # import torch
2
+ # import numpy as np
3
+ # from typing import Optional, Callable
4
+ # from config.settings import settings
5
+ # import os
6
+
7
+ # class SileroVAD:
8
+ # def __init__(self):
9
+ # self.model = None
10
+ # self.sample_rate = 16000
11
+ # self.is_streaming = False
12
+ # self.speech_callback = None
13
+ # self.audio_buffer = []
14
+ # self._initialize_model()
15
+
16
+ # def _initialize_model(self):
17
+ # """Khởi tạo Silero VAD model sử dụng torch.hub"""
18
+ # try:
19
+ # print("🔄 Đang tải Silero VAD model từ torch.hub...")
20
+
21
+ # # Sử dụng torch.hub để load model (cách chính thức)
22
+ # self.model = torch.hub.load(
23
+ # repo_or_dir=settings.VAD_MODEL,
24
+ # model='silero_vad',
25
+ # force_reload=False, # Sử dụng cache nếu có
26
+ # trust_repo=True
27
+ # )
28
+
29
+ # print("✅ Đã tải Silero VAD model thành công")
30
+
31
+ # except Exception as e:
32
+ # print(f"❌ Lỗi tải Silero VAD model: {e}")
33
+ # print("🔄 Đang thử cách tải thay thế...")
34
+ # self._initialize_model_fallback()
35
+
36
+ # def _initialize_model_fallback(self):
37
+ # """Fallback method nếu cách chính thức không hoạt động"""
38
+ # try:
39
+ # # Cách 2: Sử dụng direct download
40
+ # model_urls = {
41
+ # 'silero_vad.jit': 'https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.jit'
42
+ # }
43
+
44
+ # # Tạo thư mục cache
45
+ # os.makedirs('./models', exist_ok=True)
46
+ # model_path = './models/silero_vad.jit'
47
+
48
+ # if not os.path.exists(model_path):
49
+ # print("📥 Đang download Silero VAD model...")
50
+ # torch.hub.download_url_to_file(
51
+ # model_urls['silero_vad.jit'],
52
+ # model_path
53
+ # )
54
+
55
+ # # Load model
56
+ # self.model = torch.jit.load(model_path)
57
+ # self.model.eval()
58
+ # print("✅ Đã tải Silero VAD model thành công (fallback)")
59
+
60
+ # except Exception as e:
61
+ # print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
62
+ # self.model = None
63
+
64
+ # def start_stream(self, speech_callback: Callable):
65
+ # """Bắt đầu stream với VAD"""
66
+ # if self.model is None:
67
+ # print("❌ Silero VAD model chưa được khởi tạo")
68
+ # return False
69
+
70
+ # self.is_streaming = True
71
+ # self.speech_callback = speech_callback
72
+ # self.audio_buffer = []
73
+ # print("🎙️ Bắt đầu Silero VAD streaming...")
74
+ # return True
75
+
76
+ # def stop_stream(self):
77
+ # """Dừng stream"""
78
+ # self.is_streaming = False
79
+ # self.speech_callback = None
80
+ # self.audio_buffer = []
81
+ # print("🛑 Đã dừng Silero VAD streaming")
82
+
83
+ # def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
84
+ # """Xử lý audio chunk với Silero VAD"""
85
+ # if not self.is_streaming or self.model is None:
86
+ # return
87
+
88
+ # try:
89
+ # # Resample nếu cần
90
+ # if sample_rate != self.sample_rate:
91
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
92
+
93
+ # # Thêm vào buffer
94
+ # self.audio_buffer.extend(audio_chunk)
95
+
96
+ # # Xử lý khi buffer đủ lớn (1 giây)
97
+ # buffer_duration = len(self.audio_buffer) / self.sample_rate
98
+ # if buffer_duration >= 1.0:
99
+ # self._process_buffer()
100
+
101
+ # except Exception as e:
102
+ # print(f"❌ Lỗi xử lý Silero VAD: {e}")
103
+
104
+ # def _process_buffer(self):
105
+ # """Xử lý buffer audio với Silero VAD"""
106
+ # try:
107
+ # chunk_size = self.sample_rate # 1 giây
108
+ # if len(self.audio_buffer) < chunk_size:
109
+ # return
110
+
111
+ # # Lấy chunk 1 giây
112
+ # audio_chunk = np.array(self.audio_buffer[:chunk_size])
113
+
114
+ # # Chuẩn hóa audio cho Silero
115
+ # if audio_chunk.dtype != np.float32:
116
+ # audio_chunk = audio_chunk.astype(np.float32)
117
+ # if np.max(np.abs(audio_chunk)) > 1.0:
118
+ # audio_chunk = audio_chunk / 32768.0 # Normalize từ int16
119
+
120
+ # # Đảm bảo audio trong range [-1, 1]
121
+ # audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
122
+
123
+ # # Chuyển thành tensor
124
+ # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
125
+
126
+ # # Phát hiện speech với Silero VAD
127
+ # with torch.no_grad():
128
+ # speech_prob = self.model(audio_tensor, self.sample_rate).item()
129
+
130
+ # print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
131
+
132
+ # # Ngưỡng phát hiện speech
133
+ # if speech_prob > settings.VAD_THRESHOLD:
134
+ # print(f"🎯 Silero VAD phát hiện speech: {speech_prob:.3f}")
135
+
136
+ # # Gọi callback với speech segment
137
+ # if self.speech_callback:
138
+ # self.speech_callback(audio_chunk, self.sample_rate)
139
+
140
+ # # Giữ lại 0.3 giây cuối để overlap
141
+ # keep_samples = int(self.sample_rate * 0.3)
142
+ # if len(self.audio_buffer) > keep_samples:
143
+ # self.audio_buffer = self.audio_buffer[-keep_samples:]
144
+ # else:
145
+ # self.audio_buffer = []
146
+
147
+ # except Exception as e:
148
+ # print(f"❌ Lỗi xử lý Silero VAD buffer: {e}")
149
+ # self.audio_buffer = []
150
+
151
+ # def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
152
+ # """Resample audio nếu cần"""
153
+ # if orig_sr == target_sr:
154
+ # return audio
155
+
156
+ # try:
157
+ # # Simple resampling bằng interpolation
158
+ # orig_length = len(audio)
159
+ # new_length = int(orig_length * target_sr / orig_sr)
160
+
161
+ # # Linear interpolation
162
+ # x_old = np.linspace(0, 1, orig_length)
163
+ # x_new = np.linspace(0, 1, new_length)
164
+ # resampled_audio = np.interp(x_new, x_old, audio)
165
+
166
+ # return resampled_audio
167
+ # except Exception as e:
168
+ # print(f"⚠️ Lỗi resample: {e}")
169
+ # return audio
170
+
171
+ # def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
172
+ # """Kiểm tra xem audio chunk có phải là speech không"""
173
+ # if self.model is None:
174
+ # return True # Fallback: luôn coi là speech
175
+
176
+ # try:
177
+ # # Resample nếu cần
178
+ # if sample_rate != self.sample_rate:
179
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
180
+
181
+ # # Chuẩn hóa audio
182
+ # if audio_chunk.dtype != np.float32:
183
+ # audio_chunk = audio_chunk.astype(np.float32)
184
+ # if np.max(np.abs(audio_chunk)) > 1.0:
185
+ # audio_chunk = audio_chunk / 32768.0
186
+
187
+ # audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
188
+
189
+ # # Đảm bảo độ dài phù hợp
190
+ # if len(audio_chunk) < 512:
191
+ # padding = np.zeros(512 - len(audio_chunk), dtype=np.float32)
192
+ # audio_chunk = np.concatenate([audio_chunk, padding])
193
+
194
+ # # Chuyển thành tensor
195
+ # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
196
+
197
+ # # Phát hiện speech
198
+ # with torch.no_grad():
199
+ # speech_prob = self.model(audio_tensor, self.sample_rate).item()
200
+
201
+ # # Kiểm tra ngưỡng
202
+ # return speech_prob > settings.VAD_THRESHOLD
203
+
204
+ # except Exception as e:
205
+ # print(f"❌ Lỗi kiểm tra speech với Silero: {e}")
206
+ # return True
207
+
208
+ # def get_speech_probability(self, audio_chunk: np.ndarray, sample_rate: int) -> float:
209
+ # """Lấy xác suất speech"""
210
+ # if self.model is None:
211
+ # return 0.0
212
+
213
+ # try:
214
+ # # Resample nếu cần
215
+ # if sample_rate != self.sample_rate:
216
+ # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
217
+
218
+ # # Chuẩn hóa audio
219
+ # if audio_chunk.dtype != np.float32:
220
+ # audio_chunk = audio_chunk.astype(np.float32)
221
+ # if np.max(np.abs(audio_chunk)) > 1.0:
222
+ # audio_chunk = audio_chunk / 32768.0
223
+
224
+ # audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
225
+
226
+ # # Đảm bảo độ dài phù hợp
227
+ # if len(audio_chunk) < 512:
228
+ # padding = np.zeros(512 - len(audio_chunk), dtype=np.float32)
229
+ # audio_chunk = np.concatenate([audio_chunk, padding])
230
+
231
+ # # Chuyển thành tensor
232
+ # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
233
+
234
+ # # Phát hiện speech
235
+ # with torch.no_grad():
236
+ # return self.model(audio_tensor, self.sample_rate).item()
237
+
238
+ # except Exception as e:
239
+ # print(f"❌ Lỗi lấy speech probability: {e}")
240
# return 0.0
import os
import time
from typing import Optional, Callable

import numpy as np
import torch

from config.settings import settings
246
 
247
  class SileroVAD:
248
  def __init__(self):
 
251
  self.is_streaming = False
252
  self.speech_callback = None
253
  self.audio_buffer = []
254
+ self.speech_start_time = 0
255
+ self.min_speech_duration = 0.5 # Giây
256
  self._initialize_model()
257
 
258
  def _initialize_model(self):
259
+ """Khởi tạo Silero VAD model"""
260
  try:
261
+ print("🔄 Đang tải Silero VAD model...")
262
 
263
+ # Sử dụng torch.hub
264
  self.model = torch.hub.load(
265
+ repo_or_dir='snakers4/silero-vad',
266
  model='silero_vad',
267
+ force_reload=False,
268
  trust_repo=True
269
  )
270
 
271
+ self.model.eval()
272
  print("✅ Đã tải Silero VAD model thành công")
273
 
274
  except Exception as e:
275
  print(f"❌ Lỗi tải Silero VAD model: {e}")
 
276
  self._initialize_model_fallback()
277
 
278
  def _initialize_model_fallback(self):
279
+ """Fallback method"""
280
  try:
281
+ # Tạo model trực tiếp
282
+ model_dir = torch.hub.get_dir()
283
+ model_path = os.path.join(model_dir, 'snakers4_silero-vad_master', 'files', 'silero_vad.jit')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
+ if os.path.exists(model_path):
286
+ self.model = torch.jit.load(model_path)
287
+ self.model.eval()
288
+ print("✅ Đã tải Silero VAD model thành công (fallback)")
289
+ else:
290
+ print("❌ Không tìm thấy model file")
291
+ self.model = None
292
+
293
  except Exception as e:
294
  print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
295
  self.model = None
 
303
  self.is_streaming = True
304
  self.speech_callback = speech_callback
305
  self.audio_buffer = []
306
+ self.speech_start_time = 0
307
  print("🎙️ Bắt đầu Silero VAD streaming...")
308
  return True
309
 
 
312
  self.is_streaming = False
313
  self.speech_callback = None
314
  self.audio_buffer = []
315
+ self.speech_start_time = 0
316
  print("🛑 Đã dừng Silero VAD streaming")
317
 
318
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
319
+ """Xử lý audio chunk với Silero VAD cải tiến"""
320
  if not self.is_streaming or self.model is None:
321
  return
322
 
 
328
  # Thêm vào buffer
329
  self.audio_buffer.extend(audio_chunk)
330
 
331
+ # Xử lý khi buffer đủ lớn (0.5 giây)
332
  buffer_duration = len(self.audio_buffer) / self.sample_rate
333
+ if buffer_duration >= 0.5:
334
  self._process_buffer()
335
 
336
  except Exception as e:
337
  print(f"❌ Lỗi xử lý Silero VAD: {e}")
338
 
339
  def _process_buffer(self):
340
+ """Xử lý buffer audio với Silero VAD cải tiến"""
341
  try:
342
+ chunk_size = int(self.sample_rate * 0.5) # 0.5 giây
343
  if len(self.audio_buffer) < chunk_size:
344
  return
345
 
346
+ # Lấy chunk
347
  audio_chunk = np.array(self.audio_buffer[:chunk_size])
348
 
349
+ # Chuẩn hóa audio
350
+ audio_chunk = self._normalize_audio(audio_chunk)
 
 
 
 
 
 
 
 
 
351
 
352
+ # Phát hiện speech
353
+ speech_prob = self._get_speech_probability(audio_chunk)
 
354
 
355
  print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
356
 
357
  # Ngưỡng phát hiện speech
358
  if speech_prob > settings.VAD_THRESHOLD:
359
+ current_time = time.time()
360
+
361
+ if self.speech_start_time == 0:
362
+ self.speech_start_time = current_time
363
+ print("🎯 Bắt đầu phát hiện speech")
364
 
365
+ # Gọi callback nếu đủ thời gian speech
366
+ speech_duration = current_time - self.speech_start_time
367
+ if speech_duration >= self.min_speech_duration:
368
+ if self.speech_callback:
369
+ # Lấy toàn bộ audio từ buffer
370
+ full_audio = np.array(self.audio_buffer)
371
+ full_audio = self._normalize_audio(full_audio)
372
+ self.speech_callback(full_audio, self.sample_rate)
373
+
374
+ # Xóa buffer sau khi xử lý
375
+ self.audio_buffer = []
376
+ self.speech_start_time = 0
377
  else:
378
+ # Reset nếu không phải speech
379
+ if self.speech_start_time > 0:
380
+ print("🔇 Kết thúc speech segment")
381
+ self.speech_start_time = 0
382
+
383
+ # Giữ lại 0.2 giây cuối để overlap
384
+ keep_samples = int(self.sample_rate * 0.2)
385
+ if len(self.audio_buffer) > keep_samples:
386
+ self.audio_buffer = self.audio_buffer[-keep_samples:]
387
+ else:
388
+ self.audio_buffer = []
389
 
390
  except Exception as e:
391
  print(f"❌ Lỗi xử lý Silero VAD buffer: {e}")
392
  self.audio_buffer = []
393
+ self.speech_start_time = 0
394
+
395
+ def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
396
+ """Chuẩn hóa audio cho Silero VAD"""
397
+ if audio.dtype != np.float32:
398
+ audio = audio.astype(np.float32)
399
+ if np.max(np.abs(audio)) > 1.0:
400
+ audio = audio / 32768.0 # Normalize từ int16
401
+
402
+ return np.clip(audio, -1.0, 1.0)
403
+
404
+ def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
405
+ """Lấy xác suất speech từ audio chunk"""
406
+ try:
407
+ # Đảm bảo độ dài phù hợp
408
+ if len(audio_chunk) < 512:
409
+ padding = np.zeros(512 - len(audio_chunk), dtype=np.float32)
410
+ audio_chunk = np.concatenate([audio_chunk, padding])
411
+
412
+ # Chuyển thành tensor
413
+ audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
414
+
415
+ # Phát hiện speech
416
+ with torch.no_grad():
417
+ return self.model(audio_tensor, self.sample_rate).item()
418
+
419
+ except Exception as e:
420
+ print(f"❌ Lỗi lấy speech probability: {e}")
421
+ return 0.0
422
 
423
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
424
+ """Resample audio"""
425
  if orig_sr == target_sr:
426
  return audio
427
 
428
  try:
429
+ # Simple resampling
430
  orig_length = len(audio)
431
  new_length = int(orig_length * target_sr / orig_sr)
432
 
 
433
  x_old = np.linspace(0, 1, orig_length)
434
  x_new = np.linspace(0, 1, new_length)
435
  resampled_audio = np.interp(x_new, x_old, audio)
 
442
  def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
443
  """Kiểm tra xem audio chunk có phải là speech không"""
444
  if self.model is None:
445
+ return True
446
 
447
  try:
448
  # Resample nếu cần
 
450
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
451
 
452
  # Chuẩn hóa audio
453
+ audio_chunk = self._normalize_audio(audio_chunk)
 
 
 
 
 
454
 
455
+ # Lấy xác suất speech
456
+ speech_prob = self._get_speech_probability(audio_chunk)
 
 
 
 
 
 
 
 
 
457
 
 
458
  return speech_prob > settings.VAD_THRESHOLD
459
 
460
  except Exception as e:
 
472
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
473
 
474
  # Chuẩn hóa audio
475
+ audio_chunk = self._normalize_audio(audio_chunk)
 
 
 
 
 
 
 
 
 
 
476
 
477
+ return self._get_speech_probability(audio_chunk)
 
 
 
 
 
478
 
479
  except Exception as e:
480
  print(f"❌ Lỗi lấy speech probability: {e}")