-
-
Notifications
You must be signed in to change notification settings - Fork 33.4k
/
Copy pathvad.py
324 lines (252 loc) · 10.8 KB
/
vad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
"""Voice activity detection."""
from __future__ import annotations
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from enum import StrEnum
import logging
from .const import SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
_LOGGER = logging.getLogger(__name__)
class VadSensitivity(StrEnum):
"""How quickly the end of a voice command is detected."""
DEFAULT = "default"
RELAXED = "relaxed"
AGGRESSIVE = "aggressive"
@staticmethod
def to_seconds(sensitivity: VadSensitivity | str) -> float:
"""Return seconds of silence for sensitivity level."""
sensitivity = VadSensitivity(sensitivity)
if sensitivity == VadSensitivity.RELAXED:
return 1.25
if sensitivity == VadSensitivity.AGGRESSIVE:
return 0.25
return 0.7
class AudioBuffer:
"""Fixed-sized audio buffer with variable internal length."""
def __init__(self, maxlen: int) -> None:
"""Initialize buffer."""
self._buffer = bytearray(maxlen)
self._length = 0
@property
def length(self) -> int:
"""Get number of bytes currently in the buffer."""
return self._length
def clear(self) -> None:
"""Clear the buffer."""
self._length = 0
def append(self, data: bytes) -> None:
"""Append bytes to the buffer, increasing the internal length."""
data_len = len(data)
if (self._length + data_len) > len(self._buffer):
raise ValueError("Length cannot be greater than buffer size")
self._buffer[self._length : self._length + data_len] = data
self._length += data_len
def bytes(self) -> bytes:
"""Convert written portion of buffer to bytes."""
return bytes(self._buffer[: self._length])
def __len__(self) -> int:
"""Get the number of bytes currently in the buffer."""
return self._length
def __bool__(self) -> bool:
"""Return True if there are bytes in the buffer."""
return self._length > 0
@dataclass
class VoiceCommandSegmenter:
"""Segments an audio stream into voice commands."""
speech_seconds: float = 0.1
"""Seconds of speech before voice command has started."""
command_seconds: float = 1.0
"""Minimum number of seconds for a voice command."""
silence_seconds: float = 0.7
"""Seconds of silence after voice command has ended."""
timeout_seconds: float = 15.0
"""Maximum number of seconds before stopping with timeout=True."""
reset_seconds: float = 1.0
"""Seconds before reset start/stop time counters."""
in_command: bool = False
"""True if inside voice command."""
timed_out: bool = False
"""True a timeout occurred during voice command."""
before_command_speech_threshold: float = 0.2
"""Probability threshold for speech before voice command."""
in_command_speech_threshold: float = 0.5
"""Probability threshold for speech during voice command."""
_speech_seconds_left: float = 0.0
"""Seconds left before considering voice command as started."""
_command_seconds_left: float = 0.0
"""Seconds left before voice command could stop."""
_silence_seconds_left: float = 0.0
"""Seconds left before considering voice command as stopped."""
_timeout_seconds_left: float = 0.0
"""Seconds left before considering voice command timed out."""
_reset_seconds_left: float = 0.0
"""Seconds left before resetting start/stop time counters."""
def __post_init__(self) -> None:
"""Reset after initialization."""
self.reset()
def reset(self) -> None:
"""Reset all counters and state."""
self._speech_seconds_left = self.speech_seconds
self._command_seconds_left = self.command_seconds - self.speech_seconds
self._silence_seconds_left = self.silence_seconds
self._timeout_seconds_left = self.timeout_seconds
self._reset_seconds_left = self.reset_seconds
self.in_command = False
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
"""Process samples using external VAD.
Returns False when command is done.
"""
if self.timed_out:
self.timed_out = False
self._timeout_seconds_left -= chunk_seconds
if self._timeout_seconds_left <= 0:
_LOGGER.debug(
"VAD end of speech detection timed out after %s seconds",
self.timeout_seconds,
)
self.reset()
self.timed_out = True
return False
if speech_probability is None:
speech_probability = 0.0
if not self.in_command:
# Before command
is_speech = speech_probability > self.before_command_speech_threshold
if is_speech:
self._reset_seconds_left = self.reset_seconds
self._speech_seconds_left -= chunk_seconds
if self._speech_seconds_left <= 0:
# Inside voice command
self.in_command = True
self._command_seconds_left = (
self.command_seconds - self.speech_seconds
)
self._silence_seconds_left = self.silence_seconds
_LOGGER.debug("Voice command started")
else:
# Reset if enough silence
self._reset_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._speech_seconds_left = self.speech_seconds
self._reset_seconds_left = self.reset_seconds
else:
# In command
is_speech = speech_probability > self.in_command_speech_threshold
if not is_speech:
# Silence in command
self._reset_seconds_left = self.reset_seconds
self._silence_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if (self._silence_seconds_left <= 0) and (
self._command_seconds_left <= 0
):
# Command finished successfully
self.reset()
_LOGGER.debug("Voice command finished")
return False
else:
# Speech in command.
# Reset silence counter if enough speech.
self._reset_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._silence_seconds_left = self.silence_seconds
self._reset_seconds_left = self.reset_seconds
return True
def process_with_vad(
self,
chunk: bytes,
vad_samples_per_chunk: int | None,
vad_is_speech: Callable[[bytes], bool],
leftover_chunk_buffer: AudioBuffer | None,
) -> bool:
"""Process an audio chunk using an external VAD.
A buffer is required if the VAD requires fixed-sized audio chunks (usually the case).
Returns False when voice command is finished.
"""
if vad_samples_per_chunk is None:
# No chunking
chunk_seconds = (
len(chunk) // (SAMPLE_WIDTH * SAMPLE_CHANNELS)
) / SAMPLE_RATE
is_speech = vad_is_speech(chunk)
return self.process(chunk_seconds, is_speech)
if leftover_chunk_buffer is None:
raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
# With chunking
seconds_per_chunk = vad_samples_per_chunk / SAMPLE_RATE
bytes_per_chunk = vad_samples_per_chunk * (SAMPLE_WIDTH * SAMPLE_CHANNELS)
for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
is_speech = vad_is_speech(vad_chunk)
if not self.process(seconds_per_chunk, is_speech):
return False
return True
@dataclass
class VoiceActivityTimeout:
"""Detects silence in audio until a timeout is reached."""
silence_seconds: float
"""Seconds of silence before timeout."""
reset_seconds: float = 0.5
"""Seconds of speech before resetting timeout."""
speech_threshold: float = 0.5
"""Threshold for speech."""
_silence_seconds_left: float = 0.0
"""Seconds left before considering voice command as stopped."""
_reset_seconds_left: float = 0.0
"""Seconds left before resetting start/stop time counters."""
def __post_init__(self) -> None:
"""Reset after initialization."""
self.reset()
def reset(self) -> None:
"""Reset all counters and state."""
self._silence_seconds_left = self.silence_seconds
self._reset_seconds_left = self.reset_seconds
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
"""Process samples using external VAD.
Returns False when timeout is reached.
"""
if speech_probability is None:
speech_probability = 0.0
if speech_probability > self.speech_threshold:
# Speech
self._reset_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
# Reset timeout
self._silence_seconds_left = self.silence_seconds
else:
# Silence
self._silence_seconds_left -= chunk_seconds
if self._silence_seconds_left <= 0:
# Timeout reached
self.reset()
return False
# Slowly build reset counter back up
self._reset_seconds_left = min(
self.reset_seconds, self._reset_seconds_left + chunk_seconds
)
return True
def chunk_samples(
samples: bytes,
bytes_per_chunk: int,
leftover_chunk_buffer: AudioBuffer,
) -> Iterable[bytes]:
"""Yield fixed-sized chunks from samples, keeping leftover bytes from previous call(s)."""
if (len(leftover_chunk_buffer) + len(samples)) < bytes_per_chunk:
# Extend leftover chunk, but not enough samples to complete it
leftover_chunk_buffer.append(samples)
return
next_chunk_idx = 0
if leftover_chunk_buffer:
# Add to leftover chunk from previous call(s).
bytes_to_copy = bytes_per_chunk - len(leftover_chunk_buffer)
leftover_chunk_buffer.append(samples[:bytes_to_copy])
next_chunk_idx = bytes_to_copy
# Process full chunk in buffer
yield leftover_chunk_buffer.bytes()
leftover_chunk_buffer.clear()
while next_chunk_idx < len(samples) - bytes_per_chunk + 1:
# Process full chunk
yield samples[next_chunk_idx : next_chunk_idx + bytes_per_chunk]
next_chunk_idx += bytes_per_chunk
# Capture leftover chunks
if rest_samples := samples[next_chunk_idx:]:
leftover_chunk_buffer.append(rest_samples)