Bob 2.0 implementation of VAD based on signal's energy

This algorithm is a legacy one. The API has changed since its implementation. New versions and forks will need to be updated.
This algorithm is splittable

Algorithms have at least one input and one output. All algorithm endpoints are organized in groups. Groups are used by the platform to indicate which inputs and outputs are synchronized together. The first group is automatically synchronized with the channel defined by the block in which the algorithm is deployed.

Group: main

Endpoint Name Data Format Nature
speech system/array_1d_floats/1 Input
vad system/array_1d_integers/1 Output

Parameters allow users to change the configuration of an algorithm when scheduling an experiment

Name Description Type Default Range/Choices
rate sampling rate float64 16000.0
xxxxxxxxxx
215
 
1
import numpy
2
import bob.core
3
import bob.ap
4
import bob.learn.em
5
6
import logging
7
logger = logging.getLogger("bob.c++")
8
9
10
11
def normalize_std_array(vector):
  """Normalizes an arrayset to zero mean and unit variance.

  Each feature (column) is shifted by its mean and divided by its population
  standard deviation, both computed over all samples.

  Parameters:
    vector: sequence of ``n`` scalar samples, or an array of shape ``(n, d)``
      holding one ``d``-dimensional sample per row.

  Returns:
    A ``float64`` array of shape ``(n, d)`` with the normalized samples
    (``d == 1`` for scalar samples). An empty input yields an empty array of
    shape ``(0, 1)``. A feature with zero variance yields NaN (or inf)
    entries, because the division by a zero standard deviation is not
    guarded.
  """
  data = numpy.asarray(vector, dtype='float64')
  n_samples = data.shape[0]
  if n_samples == 0:
    # Nothing to normalize; avoids computing the mean of an empty set.
    return numpy.empty((0, 1), dtype='float64')

  # One row per sample, one column per feature.
  data = data.reshape(n_samples, -1)

  # The deviation-based std avoids the cancellation of E[x^2] - E[x]^2, which
  # can lose all precision (or turn slightly negative) for large offsets.
  mean = data.mean(axis=0)
  std = data.std(axis=0)

  # Zero-variance features deliberately propagate NaN/inf instead of raising
  # floating-point warnings.
  with numpy.errstate(divide='ignore', invalid='ignore'):
    return (data - mean) / std
38
39
40
def smoothing(labels, smoothing_window):
  """Smooths a sequence of binary VAD labels.

  The labels are modified in place and also returned.

  Steps:
    1. If the number of speech frames (sum of the labels) is below
       ``smoothing_window``, the labels are returned untouched.
    2. Single non-speech frames surrounded by speech are set to speech, then
       single speech frames surrounded by non-speech are set to non-speech.
    3. The labels are split into runs of identical values. A run strictly
       shorter than ``smoothing_window`` is inverted when every neighbouring
       run is strictly longer than ``smoothing_window``. Run lengths are
       measured once, before any inversion.

  Parameters:
    labels: mutable sequence of 0/1 labels, one per frame.
    smoothing_window: length threshold, in frames.

  Returns:
    The same ``labels`` object, after smoothing.
  """
  n_labels = len(labels)
  if n_labels == 0 or numpy.sum(labels) < smoothing_window:
    return labels

  # Fill one-frame holes inside speech ...
  for k in range(1, n_labels - 1):
    if labels[k] == 0 and labels[k - 1] == 1 and labels[k + 1] == 1:
      labels[k] = 1
  # ... then drop one-frame speech bursts inside non-speech.
  for k in range(1, n_labels - 1):
    if labels[k] == 1 and labels[k - 1] == 0 and labels[k + 1] == 0:
      labels[k] = 0

  # Run-length encode the labels as (first index, last index, value).
  segments = []
  start = 0
  for k in range(1, n_labels):
    if labels[k] != labels[k - 1]:
      segments.append((start, k - 1, labels[start]))
      start = k
  segments.append((start, n_labels - 1, labels[start]))

  if len(segments) < 2:
    return labels

  lengths = [last - first + 1 for first, last, _ in segments]
  final = len(segments) - 1

  for idx, (first, last, value) in enumerate(segments):
    if lengths[idx] >= smoothing_window:
      continue  # long enough to be kept as is
    if idx > 0 and lengths[idx - 1] <= smoothing_window:
      continue  # previous run is not long enough to absorb this one
    if idx < final and lengths[idx + 1] <= smoothing_window:
      continue  # following run is not long enough to absorb this one
    flipped = 0 if value == 1 else 1
    labels[first:last + 1] = [flipped] * lengths[idx]

  return labels
103
104
105
106
107
108
109
class Algorithm:
  """Energy-based voice activity detection (VAD) block.

  Reads a speech signal from the ``speech`` input, computes frame energies,
  fits a two-component GMM on the normalized energies and labels each frame
  as speech (1) or non-speech (0). The smoothed labels are written to the
  ``vad`` output.
  """

  def __init__(self):
    # Framing parameters passed to bob.ap.Energy in setup()
    self.win_length_ms = 20
    self.win_shift_ms = 10
    # VAD parameters
    self.max_iterations = 10  # iteration limit for the k-means training
    self.smoothing_window = 10  # This corresponds to 100ms
    self.rate = 16000

  def _train_gmm(self, normalized_energy):
    """Fits a 2-Gaussian GMM, initialized by k-means, on the energies.

    Propagation of the module-level ``logger`` is muted while training and is
    always restored afterwards, including on the early return and when an
    exception is raised.

    Returns:
      The trained ``bob.learn.em.GMMMachine``, or ``None`` when k-means
      produced a NaN mean.
    """
    convergence_threshold = 0.0005
    gmm_max_iterations = 25

    kmeans = bob.learn.em.KMeansMachine(2, 1)

    logger_propagate = logger.propagate
    logger.propagate = False
    try:
      m_ubm = bob.learn.em.GMMMachine(2, 1)
      kmeans_trainer = bob.learn.em.KMeansTrainer('RANDOM_NO_DUPLICATE')

      # Trains using the KMeansTrainer
      bob.learn.em.train(kmeans_trainer, kmeans, normalized_energy,
                         self.max_iterations, convergence_threshold)

      [variances, weights] = kmeans.get_variances_and_weights_for_each_cluster(normalized_energy)
      means = kmeans.means
      if numpy.isnan(means[0]) or numpy.isnan(means[1]):
        return None

      # Initializes the GMM from the k-means clusters
      m_ubm.means = means
      m_ubm.variances = variances
      m_ubm.weights = weights
      m_ubm.set_variance_thresholds(0.0005)

      trainer = bob.learn.em.ML_GMMTrainer(True, True, True)
      bob.learn.em.train(trainer, m_ubm, normalized_energy,
                         gmm_max_iterations, convergence_threshold)
      return m_ubm
    finally:
      # Restore whatever propagation setting was active before training.
      logger.propagate = logger_propagate

  def _voice_activity_detection(self, energy_array):
    """Labels each energy frame as speech (1) or non-speech (0).

    Parameters:
      energy_array: per-frame energies, as produced by ``self.preprocessor``.

    Returns:
      An ``int16`` array with one label per frame. All labels are 0 when the
      k-means initialization yields a NaN mean.
    """
    n_samples = len(energy_array)
    normalized_energy = normalize_std_array(energy_array)

    m_ubm = self._train_gmm(normalized_energy)
    if m_ubm is None:
      # Clustering failed: report no speech at all for this signal.
      return numpy.zeros(n_samples, dtype=numpy.int16)

    # The Gaussian with the larger mean models the high-energy frames.
    means = m_ubm.means
    if means[0] < means[1]:
      higher, lower = 1, 0
    else:
      higher, lower = 0, 1

    higher_mean_gauss = m_ubm.get_gaussian(higher)
    lower_mean_gauss = m_ubm.get_gaussian(lower)

    # A frame is non-speech when the low-energy Gaussian explains it better.
    label = numpy.ones(n_samples, dtype=numpy.int16)
    for i in range(n_samples):
      frame = normalized_energy[i]
      if higher_mean_gauss.log_likelihood(frame) < lower_mean_gauss.log_likelihood(frame):
        label[i] = 0

    return label

  def setup(self, parameters):
    """Reads the optional ``rate`` parameter and builds the energy extractor.

    Returns:
      Always ``True``.
    """
    self.rate = parameters.get('rate', self.rate)
    self.preprocessor = bob.ap.Energy(self.rate, self.win_length_ms, self.win_shift_ms)
    return True

  def process(self, inputs, outputs):
    """Computes the VAD labels of one ``speech`` input and writes them to ``vad``.

    Returns:
      Always ``True``.
    """
    float_wav = inputs["speech"].data.value
    energy_array = self.preprocessor(float_wav)
    labels = self._voice_activity_detection(energy_array)

    # Discard isolated speech shorter than the smoothing window (100ms by default)
    vad_labels = smoothing(labels, self.smoothing_window)

    outputs["vad"].write({
      'value': vad_labels
    })

    return True
215

The code for this algorithm in Python
The ruler at 80 columns indicates suggested POSIX line breaks (for readability).
The editor will automatically enlarge to accommodate the entirety of your input
Use keyboard shortcuts for search/replace and faster editing. For example, use Ctrl-F (PC) or Cmd-F (Mac) to search through this box

Raw content

Experiments

Updated Name Databases/Protocols Analyzers
pkorshunov/pkorshunov/isv-asv-pad-fusion-complete/1/asv_isv-pad_lbp_hist_ratios_lr-fusion_lr-pa_aligned avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verify_train,avspoof/2@physicalaccess_verify_train_spoof,avspoof/2@physicalaccess_antispoofing,avspoof/2@physicalaccess_verification_spoof pkorshunov/spoof-score-fusion-roc_hist/1
pkorshunov/pkorshunov/isv-asv-pad-fusion-complete/1/asv_isv-pad_gmm-fusion_lr-pa avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verify_train,avspoof/2@physicalaccess_verify_train_spoof,avspoof/2@physicalaccess_antispoofing,avspoof/2@physicalaccess_verification_spoof pkorshunov/spoof-score-fusion-roc_hist/1
pkorshunov/pkorshunov/isv-speaker-verification-spoof/1/isv-speaker-verification-spoof-pa avspoof/2@physicalaccess_verification,avspoof/2@physicalaccess_verification_spoof pkorshunov/eerhter_postperf_iso_spoof/1
pkorshunov/pkorshunov/isv-speaker-verification/1/isv-speaker-verification-licit avspoof/2@physicalaccess_verification pkorshunov/eerhter_postperf_iso/1
Created with Raphaël 2.1.2[compare]pkorshunov/energy/12016Mar10

This table shows the number of times this algorithm has been successfully run using the given environment. Note this does not provide sufficient information to evaluate if the algorithm will run when submitted to different conditions.

Terms of Service | Contact Information | BEAT platform version 2.2.1b0 | © Idiap Research Institute - 2013-2025