Commit 7b2827be by pbethge (parent 8afb6556): add speechbrain lid example
# Voice Activity Detection for Keyword Spotting
In this example we use two artificial neural networks. After gathering a chunk of audio, we check whether it contains human speech. This process is called Voice Activity Detection (VAD). If a voice is detected, we start accumulating audio in order to feed the Keyword Spotting system (KWS). The KWS can be replaced with any other model that consumes speech.
Parts of this code are heavily borrowed from:
- [silero-vad](https://github.com/snakers4/silero-vad)
- [speech-commands](https://github.com/douglas125/SpeechCmdRecognition)
Please check out [this fork of speech-commands](https://github.com/bytosaur/SpeechCmdRecognition) to train on specific words.
### Installing Python Requirements
__Note__: We suggest using a virtual environment when working with Python code.
```shell
pip install -r requirements.txt
```
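For example, one possible setup using Python's built-in `venv` module (assuming a `python3` interpreter is on your PATH) could look like:
```shell
python3 -m venv .venv        # create the environment
source .venv/bin/activate    # activate it (Linux/macOS)
pip install -r requirements.txt
```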
### Configuration
The following parameters are the most useful to tune:
```python
vad_threshold = 0.8     # minimum VAD confidence required to trigger the KWS
kws_threshold = 0.95    # minimum KWS confidence required to detect a word
kws_required_size = 4   # number of additional chunks (after the triggering one) to feed into the KWS
frame_duration_ms = 250 # chunk size for the VAD in milliseconds (250 ms is the minimum)
```
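With these defaults, each detection should hand the KWS the triggering chunk plus four more, i.e. roughly 5 × 250 ms ≈ 1.25 s of audio.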
The sample rate should be kept at 16 kHz for both neural networks. If your recordings require a higher sample rate, consider downsampling before feeding the audio to the models, as sketched below.
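As a minimal sketch, downsampling a recording with `torchaudio` (the input file name here is hypothetical) might look like:
```python
import torchaudio

# load a recording captured at a higher sample rate (hypothetical file)
waveform, orig_rate = torchaudio.load("recording_48k.wav")

# resample to the 16 kHz expected by both models
resample = torchaudio.transforms.Resample(orig_freq=orig_rate, new_freq=16000)
waveform_16k = resample(waveform)
```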
import io
import numpy as np
import pyaudio
import scipy.io.wavfile as wav
import threading
from queue import Queue
import time
import torch
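# a single torch thread is enough for the small VAD model and keeps CPU load predictable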
torch.set_num_threads(1)
import torchaudio
torchaudio.set_audio_backend("soundfile")
from utils import *  # provides the validate() helper from the silero-vad utilities
from speechbrain.pretrained import EncoderClassifier
# Configure
vad_threshold = 0.8     # minimum VAD confidence required to trigger the LID
lid_threshold = 0.8     # minimum LID confidence required to accept a language (only used in the commented-out code below)
lid_required_size = 20  # number of additional chunks (after the triggering one) to feed into the LID
frame_duration_ms = 250 # chunk size for the VAD in milliseconds (250 ms is the minimum)
# Pyaudio
FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPLE_RATE = 16000
CHUNK = int(SAMPLE_RATE / 10)  # 100 ms PyAudio buffer
audio = pyaudio.PyAudio()
chunk_size = int(SAMPLE_RATE * frame_duration_ms / 1000.0)  # samples per VAD chunk
#=== Silero VAD ===#
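# TorchScript VAD model, presumably exported from the silero-vad repo linked in the README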
model = torch.jit.load('vad_model.jit')
def normalize(sound):
    abs_max = np.abs(sound).max()
    if abs_max > 0:
        sound *= 1 / abs_max
    sound = sound.squeeze()  # depends on the use case
    return sound
#=== LID ===#
classes = []
# lid process thread
def processAudio(config, q):
    model = EncoderClassifier.from_hparams(
        "speechbrain/lang-id-commonlanguage_ecapa"
    )
    while True:
        try:
            data = q.get()
            # flatten the list of chunks into a single sample sequence
            data = [item for sublist in data for item in sublist]
            data_tensor = torch.tensor(data, dtype=torch.float32)  # float32 to match the model weights
            print(data_tensor)
            # data_tensor /= 32768.0
            data_tensor = torch.unsqueeze(data_tensor, 0)
            out = model.classify_batch(data_tensor)
            print(out[3])  # predicted language label
            # out = lid_model(data_tensor)[0].numpy()
            # index = tf.math.argmax(out).numpy()
            # if out[index] >= lid_threshold:
            #     print(classes[index])
            #     # if you want to inspect the data please uncomment
            #     # wav.write('results/'+classes[index]+'.wav', SAMPLE_RATE, np.asarray(data))
        except Exception as e:
            print("Ooopsi: ", e)
        q.task_done()
# Threading
queue = Queue()
some_config = ''
worker = threading.Thread(target=processAudio, args=(some_config, queue), daemon=True)
worker.start()
data = []
audio_int16 = []
nu_voice_chunks = 0
got_voice = False
# wait a bit for libraries to load
print("Loading libraries...")
print("This may take up to 10 seconds!")
time.sleep(7)
stream = audio.open(format=FORMAT,
channels=CHANNELS,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=CHUNK)
print("Listening...")
while True:
    # keep the last chunk so nothing gets lost
    last_chunk = audio_int16
    # sample a chunk, convert to float and normalize
    audio_chunk = stream.read(chunk_size)
    audio_int16 = np.frombuffer(audio_chunk, np.int16)
    audio_float32 = audio_int16.astype('float32')
    audio_float32_norm = normalize(audio_float32)
    # get the confidences
    vad_outs = validate(model, torch.from_numpy(audio_float32_norm))[:, 1]
    # trigger if voice is detected
    if vad_outs >= vad_threshold and not got_voice:
        print("Voice detected! Recording...")
        got_voice = True
        data = []
        data.append(last_chunk)
    # collect data and analyze
    if got_voice:
        if nu_voice_chunks < lid_required_size:
            data.append(audio_int16)
            nu_voice_chunks += 1
        else:
            got_voice = False
            nu_voice_chunks = 0
            queue.put(data)
            queue.join()
# Silero VAD
numpy==1.19.5
torch==1.8.1
matplotlib==3.4.2
torchaudio==0.8.1
soundfile==0.10.3.post1
pyaudio==0.2.11
# LID (SpeechBrain)
speechbrain