Bob 2.0 implementation of Voice Activity Detection (VAD) based on 4Hz energy filtering
Algorithms have at least one input and one output. All algorithm endpoints are organized in groups. Groups are used by the platform to indicate which inputs and outputs are synchronized together. The first group is automatically synchronized with the channel defined by the block in which the algorithm is deployed.
Endpoint Name | Data Format | Nature |
---|---|---|
speech | system/array_1d_floats/1 | Input |
labels | system/array_1d_integers/1 | Output |
Parameters allow users to change the configuration of an algorithm when scheduling an experiment
Name | Description | Type | Default | Range/Choices |
---|---|---|---|---|
rate | Sampling rate of the speech signal | float64 | 16000.0 | [2000.0, 256000.0] |
win_length_ms | The length of the sliding processing window, typically about 20 ms | float64 | 20.0 | |
win_shift_ms | The shift (hop) between the starts of neighboring windows. Typically half of the window length. | float64 | 10.0 | |
xxxxxxxxxx
###############################################################################
# #
# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# This file is part of the beat.core module of the BEAT platform. #
# #
# Commercial License Usage #
# Licensees holding valid commercial BEAT licenses may use this file in #
# accordance with the terms contained in a written agreement between you #
# and Idiap. For further information contact tto@idiap.ch #
# #
# Alternatively, this file may be used under the terms of the GNU Affero #
# Public License version 3 as published by the Free Software and appearing #
# in the file LICENSE.AGPL included in the packaging of this file. #
# The BEAT platform is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
# or FITNESS FOR A PARTICULAR PURPOSE. #
# #
# You should have received a copy of the GNU Affero Public License along #
# with the BEAT platform. If not, see http://www.gnu.org/licenses/. #
# #
###############################################################################
import numpy
import math
import scipy.signal
import bob.ap
class Mod_4Hz():
    """Voice activity detection (VAD) based on the 4 Hz modulation of the energy.

    A frame is labelled as speech (1) when its energy is close enough to the
    maximum energy of the signal and its 4 Hz modulation value is high enough;
    every other frame is labelled as non-speech (0).
    """

    def __init__(
        self,
        max_iterations=10,
        convergence_threshold=0.0005,
        variance_threshold=0.0005,
        win_length_ms=20.,        # 20 ms
        win_shift_ms=10.,         # 10 ms
        smoothing_window=10,      # 10 frames (i.e. 100 ms with a 10 ms shift)
        n_filters=40,
        f_min=0.0,                # 0 Hz
        f_max=4000,               # 4 kHz
        pre_emphasis_coef=1.0,
        ratio_threshold=0.1,      # 0.1 of the maximum energy
        **kwargs
    ):
        """Store the configuration of the detector.

        Parameters:
          max_iterations, convergence_threshold, variance_threshold:
            stored on the instance; not read by any method of this class.
          win_length_ms: length of the analysis window, in milliseconds.
          win_shift_ms: shift between two consecutive windows, in milliseconds.
          smoothing_window: default segment length (in frames) used by
            ``smoothing`` when called from ``mod_4hz``.
          n_filters, f_min, f_max, pre_emphasis_coef: forwarded to
            ``bob.ap.Spectrogram`` in ``mod_4hz``.
          ratio_threshold: energy ratio used to derive the energy threshold in
            ``_voice_activity_detection``.
          **kwargs: accepted and ignored.
        """
        # copy parameters
        self.max_iterations = max_iterations
        self.convergence_threshold = convergence_threshold
        self.variance_threshold = variance_threshold
        self.win_length_ms = win_length_ms
        self.win_shift_ms = win_shift_ms
        self.smoothing_window = smoothing_window
        self.n_filters = n_filters
        self.f_min = f_min
        self.f_max = f_max
        self.pre_emphasis_coef = pre_emphasis_coef
        self.ratio_threshold = ratio_threshold

    @staticmethod
    def _too_little_speech(labels):
        """Return True when fewer than 2000 frames and less than half of all frames are speech."""
        n_speech = numpy.sum(labels)
        return n_speech < 2000 and float(n_speech) / float(len(labels)) < 0.5

    def _voice_activity_detection(self, energy, mod_4hz):
        """Label every frame as speech (1) or non-speech (0).

        Parameters:
          energy: per-frame energy values; they are compared against
            ``max(energy) - log(1 / ratio_threshold ** 2)``.
          mod_4hz: per-frame 4 Hz modulation values; only the first
            ``len(energy)`` entries are used.

        Returns:
          A 1D ``numpy.int16`` array with one label per entry of ``energy``.
        """
        energy = numpy.asarray(energy)
        n_samples = len(energy)
        mod_4hz = numpy.asarray(mod_4hz)[:n_samples]

        threshold = numpy.max(energy) - numpy.log((1. / self.ratio_threshold) * (1. / self.ratio_threshold))
        loud = energy > threshold

        labels = numpy.zeros(n_samples, dtype=numpy.int16)
        labels[loud & (mod_4hz > 0.9)] = 1

        # If the speech part is small (fewer than 2000 frames and less than
        # half of the segment), try to find speech with more risk by relaxing
        # the modulation threshold step by step.
        for mod_threshold in (0.5, 0.2):
            if self._too_little_speech(labels):
                labels[loud & (mod_4hz > mod_threshold)] = 1

        # Last resort, reserved for short segments (fewer than 200 frames) or
        # segments with (almost) no detected speech: rely on the energy only.
        if self._too_little_speech(labels):
            if n_samples < 200 or numpy.sum(labels) == 0 or numpy.mean(labels) < 0.025:
                labels[loud] = 1

        return labels

    def averaging(self, list_1s_shift):
        """Smooth a sequence of per-frame values with (at most) 100-frame averages.

        Three passes are applied, in this order:
          1. a forward cumulative mean over the first ``min(len, 100) - 1`` values;
          2. the mean of the 100 preceding values for the indices in between;
          3. a backward cumulative mean over the last ``min(len, 100)`` values,
             starting from the last value; this pass overwrites what the
             earlier passes wrote at the same indices.

        Parameters:
          list_1s_shift: non-empty sequence of numbers.

        Returns:
          A 1D ``numpy.float64`` array of the same length.
        """
        len_list = len(list_1s_shift)
        # ``numpy.float`` was removed in NumPy 1.24; ``numpy.float64`` is the
        # type it used to alias to.
        sample_level_value = numpy.zeros(len_list, dtype=numpy.float64)
        sample_level_value[0] = list_1s_shift[0]

        edge = min(len_list, 100)

        for j in range(2, edge):
            sample_level_value[j - 1] = ((j - 1.0) / j) * sample_level_value[j - 2] + (1.0 / j) * list_1s_shift[j - 1]

        for j in range(edge, len_list - 100 + 1):
            sample_level_value[j - 1] = numpy.mean(list_1s_shift[j - 100:j])

        sample_level_value[len_list - 1] = list_1s_shift[len_list - 1]
        for j in range(2, edge + 1):
            sample_level_value[len_list - j] = ((j - 1.0) / j) * sample_level_value[len_list + 1 - j] + (1.0 / j) * list_1s_shift[len_list - j]

        return sample_level_value

    def bandpass_firwin(self, ntaps, lowcut, highcut, fs, window='hamming'):
        """Design a band-pass FIR filter with ``scipy.signal.firwin``.

        Parameters:
          ntaps: number of filter taps.
          lowcut, highcut: band edges, in the same unit as ``fs``.
          fs: sampling frequency of the signal to be filtered.
          window: window passed to ``scipy.signal.firwin``.

        Returns:
          The filter coefficients, as returned by ``scipy.signal.firwin``.
        """
        # ``fs=fs`` is equivalent to the former ``nyq=0.5 * fs``; the ``nyq``
        # keyword is deprecated and no longer accepted by recent SciPy.
        return scipy.signal.firwin(ntaps, [lowcut, highcut], fs=fs, pass_zero=False,
                                   window=window, scale=True)

    def pass_band_filtering(self, energy_bands, fs):
        """Filter every energy band with a 9-tap band-pass filter centred on 4 Hz.

        Parameters:
          energy_bands: 2D array; it is transposed before filtering and the
            filter runs along the last axis of the transposed array.
          fs: sampling frequency used to design the filter.

        Returns:
          The filtered, transposed array.
        """
        # NOTE(review): ``mod_4hz`` passes the audio sampling rate as ``fs``
        # although the bands look like they are sampled at the frame rate --
        # left as is because the detection thresholds were tuned with it.
        energy_bands = energy_bands.T
        order = 8
        Wo = 4.
        num_taps = self.bandpass_firwin(order + 1, (Wo - 0.5), (Wo + 0.5), fs)
        return scipy.signal.lfilter(num_taps, 1.0, energy_bands)

    def modulation_4hz(self, filtering_res, rate_wavsample):
        """Compute a per-frame modulation value from the filtered energy bands.

        For every frame, the value is the variance of the logarithm of the
        normalised energy over the next ``int(fs / win_length)`` frames, or 0
        when that range contains a non-positive energy.

        Parameters:
          filtering_res: 2D array; it is summed over axis 0 to get one energy
            value per position along axis 1.
          rate_wavsample: pair ``(sampling rate, signal)``; the length of the
            signal determines the number of frames.

        Returns:
          A 1D float array with one value per frame (all zeros when there are
          fewer frames than the modulation range).
        """
        fs = rate_wavsample[0]
        win_length = int(fs * self.win_length_ms / 1000)
        win_shift = int(fs * self.win_shift_ms / 1000)

        energy = filtering_res.sum(axis=0)
        energy = energy / numpy.mean(energy)

        n_frames = 1 + (rate_wavsample[1].shape[0] - win_length) // win_shift
        range_modulation = int(fs / win_length)  # This corresponds to 1 sec (original author's note)

        res = numpy.zeros(n_frames)
        if n_frames < range_modulation:
            return res

        for w in range(0, n_frames - range_modulation):
            e_range = energy[w:w + range_modulation]
            if (e_range <= 0.).any():
                res[w] = 0
            else:
                res[w] = numpy.var(numpy.log(e_range))

        # The trailing frames have no complete range ahead of them: repeat the
        # value stored just before them.
        res[n_frames - range_modulation:n_frames] = res[n_frames - range_modulation - 1]
        return res

    @staticmethod
    def _flip_segment(labels, segment):
        """Invert the label of ``segment`` (``[first, last, label]``) in ``labels`` and in the segment itself."""
        first, last, value = segment
        flipped = 0 if value == 1 else 1
        labels[first:last + 1] = numpy.full(last - first + 1, flipped)
        segment[2] = flipped

    def smoothing(self, labels, smoothing_window):
        """Apply a smoothing on the VAD labels.

        Single-frame gaps and single-frame bursts are removed first. Then every
        segment shorter than ``smoothing_window`` whose existing neighbours are
        all longer than ``smoothing_window`` gets its label inverted. Segment
        lengths are measured once, before any segment is inverted.

        Parameters:
          labels: sequence of 0/1 labels; it is modified in place.
          smoothing_window: length threshold, in frames.

        Returns:
          ``labels`` (the same object that was passed in).
        """
        if numpy.sum(labels) < smoothing_window:
            return labels

        # Fill single-frame holes, then drop single-frame bursts.
        for k in range(1, len(labels) - 1):
            if labels[k] == 0 and labels[k - 1] == 1 and labels[k + 1] == 1:
                labels[k] = 1
        for k in range(1, len(labels) - 1):
            if labels[k] == 1 and labels[k - 1] == 0 and labels[k + 1] == 0:
                labels[k] = 0

        # Split the labels into runs of identical values: [first, last, label].
        segments = []
        first = 0
        for k in range(1, len(labels)):
            if labels[k] != labels[k - 1]:
                segments.append([first, k - 1, labels[first]])
                first = k
        segments.append([first, len(labels) - 1, labels[first]])

        if len(segments) < 2:
            return labels

        def span(segment):
            return segment[1] - segment[0] + 1

        # The first and last segments have one neighbour, the others have two.
        for k, curr in enumerate(segments):
            neighbours = segments[max(k - 1, 0):k] + segments[k + 1:k + 2]
            if span(curr) < smoothing_window and all(span(s) > smoothing_window for s in neighbours):
                self._flip_segment(labels, curr)

        return labels

    def mod_4hz(self, rate_wavsample):
        """Computes and returns the VAD labels, the energy and the 4Hz modulation energy for the given input wave file.

        Parameters:
          rate_wavsample: pair ``(sampling rate, signal)``.

        Returns:
          A tuple ``(labels, energy_array, mod_4hz)``.
        """
        # Set parameters
        rate = float(rate_wavsample[0])
        wl = float(self.win_length_ms)
        ws = float(self.win_shift_ms)
        nf = self.n_filters
        f_min = float(self.f_min)
        f_max = float(self.f_max)
        pre = float(self.pre_emphasis_coef)

        c = bob.ap.Spectrogram(rate, wl, ws, nf, f_min, f_max, pre)
        c.energy_filter = True
        c.log_filter = False
        c.energy_bands = True

        sig = rate_wavsample[1]
        energy_bands = c(sig)
        filtering_res = self.pass_band_filtering(energy_bands, rate_wavsample[0])
        mod_4hz = self.modulation_4hz(filtering_res, rate_wavsample)
        mod_4hz = self.averaging(mod_4hz)

        e = bob.ap.Energy(rate, wl, ws)
        energy_array = e(rate_wavsample[1])

        labels = self._voice_activity_detection(energy_array, mod_4hz)
        # discard isolated segments shorter than the smoothing window
        labels = self.smoothing(labels, self.smoothing_window)
        return labels, energy_array, mod_4hz

    def __call__(self, input_signal, annotations=None):
        """labels speech (1) and non-speech (0) parts of the given input wave file using 4Hz modulation energy and energy

        Input parameter:
          * input_signal[0] --> rate
          * input_signal[1] --> signal
          * annotations --> accepted but not used

        Returns:
          A tuple ``(rate, signal, labels)``.
        """
        labels, _energy_array, _mod_4hz = self.mod_4hz(input_signal)
        rate = input_signal[0]
        data = input_signal[1]
        return rate, data, labels
class Algorithm:
    """BEAT algorithm reading a speech signal and writing its VAD labels.

    Reads the ``speech`` input, runs the ``Mod_4Hz`` detector on it and writes
    the resulting labels to the ``labels`` output.
    """

    def __init__(self):
        # Defaults, used when ``setup`` does not override them.
        self.win_length_ms = 20
        self.win_shift_ms = 10
        self.rate = 16000
        # Built by ``setup`` (or lazily by ``process``), so that ``process``
        # does not fail if ``setup`` was never called.
        self.preprocessor = None

    def setup(self, parameters):
        """Read the user parameters and build the detector.

        Parameters:
          parameters: mapping that may contain ``rate``, ``win_length_ms`` and
            ``win_shift_ms``; missing keys keep their current value.

        Returns:
          True.
        """
        self.rate = parameters.get('rate', self.rate)
        self.win_length_ms = parameters.get('win_length_ms', self.win_length_ms)
        self.win_shift_ms = parameters.get('win_shift_ms', self.win_shift_ms)
        self.preprocessor = Mod_4Hz(win_length_ms=self.win_length_ms, win_shift_ms=self.win_shift_ms)
        return True

    def process(self, inputs, outputs):
        """Compute the VAD labels of the current speech sample and write them out.

        Parameters:
          inputs: mapping whose ``"speech"`` entry exposes the signal as
            ``.data.value``.
          outputs: mapping whose ``"labels"`` entry exposes a ``write`` method.

        Returns:
          True.
        """
        float_wav = inputs["speech"].data.value.astype('float64')

        if not float_wav.size:
            # Empty signal: write a two-element all-zero placeholder, using the
            # same integer type as the labels produced by the detector.
            labels = numpy.zeros(2, dtype=numpy.int16)
        else:
            if self.preprocessor is None:
                self.setup({})
            labels, _energies, _mod_4hz = self.preprocessor.mod_4hz([self.rate, float_wav])

        outputs["labels"].write({
            'value': labels
        })
        return True
The code for this algorithm in Python
The ruler at 80 columns indicates suggested POSIX line breaks (for readability).
The editor will automatically enlarge to accommodate the entirety of your input.
Use keyboard shortcuts for search/replace and faster editing. For example, use Ctrl-F (PC) or Cmd-F (Mac) to search through this box
4Hz modulation of energy voice activity detection (VAD) with carefully tuned thresholds.
Updated | Name | Databases/Protocols | Analyzers | |||
---|---|---|---|---|---|---|
pkorshunov/pkorshunov/isv-asv-pad-fusion-complete/1/asv_isv-pad_lbp_hist_ratios_lr-fusion_lr-pa_aligned | avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verify_train,avspoof/2@physicalaccess_verify_train_spoof,avspoof/2@physicalaccess_antispoofing,avspoof/2@physicalaccess_verification_spoof | pkorshunov/spoof-score-fusion-roc_hist/1 | ||||
pkorshunov/pkorshunov/speech-pad-simple/1/speech-pad_lbp_hist_ratios_lr-pa_aligned | avspoof/2@physicalaccess_antispoofing | pkorshunov/simple_antispoofing_analyzer/4 | ||||
pkorshunov/pkorshunov/isv-asv-pad-fusion-complete/1/asv_isv-pad_gmm-fusion_lr-pa | avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verify_train,avspoof/2@physicalaccess_verify_train_spoof,avspoof/2@physicalaccess_antispoofing,avspoof/2@physicalaccess_verification_spoof | pkorshunov/spoof-score-fusion-roc_hist/1 | ||||
pkorshunov/pkorshunov/speech-pad-simple/1/speech-pad_gmm-pa | avspoof/2@physicalaccess_antispoofing | pkorshunov/simple_antispoofing_analyzer/4 | ||||
pkorshunov/pkorshunov/isv-speaker-verification-spoof/1/isv-speaker-verification-spoof-pa | avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verification_spoof | pkorshunov/eerhter_postperf_iso_spoof/1 | ||||
pkorshunov/pkorshunov/isv-speaker-verification/1/isv-speaker-verification-licit | avspoof/2@physicalaccess_verification | pkorshunov/eerhter_postperf_iso/1 | ||||
pkorshunov/pkorshunov/speech-antispoofing-baseline/1/btas2016-baseline-pa | avspoof/1@physicalaccess_antispoofing | pkorshunov/simple_antispoofing_analyzer/2 |
This table shows the number of times this algorithm has been successfully run using the given environment. Note this does not provide sufficient information to evaluate if the algorithm will run when submitted to different conditions.