Implements energy-based voice activity detection

This is a legacy algorithm: the API has changed since it was implemented, so new versions and forks will need to be updated accordingly.

Algorithms have at least one input and one output. All algorithm endpoints are organized in groups, which the platform uses to indicate which inputs and outputs are synchronized together. The first group is automatically synchronized with the channel defined by the block in which the algorithm is deployed (see the sketch after the table below).

Unnamed group

Endpoint Name   Data Format                   Nature
speech          system/array_1d_floats/1     Input
vad             system/array_1d_integers/1   Output
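
To make the endpoint layout concrete, here is a hedged sketch of the same information written as a Python dictionary. It only mirrors the table above for illustration; the key names and nesting are assumptions and this is not the algorithm's actual declaration file on the platform.

# Illustration only: the endpoint layout from the table above as a Python
# dictionary. The data format names come from the table; everything else
# (key names, structure) is an assumption made for readability.
endpoints = {
  "unnamed_group": {  # first group, synchronized with the block's channel
    "inputs": {
      "speech": "system/array_1d_floats/1",
    },
    "outputs": {
      "vad": "system/array_1d_integers/1",
    },
  },
}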

Parameters allow users to change the configuration of an algorithm when scheduling an experiment (a short usage sketch follows the table below).

Name   Description   Type      Default   Range/Choices
rate   -             float64   16000.0   -
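
The rate parameter is read back in the setup() method of the class shown below. As a minimal, hedged usage sketch (the dictionary passed here stands in for whatever the platform provides at scheduling time, and running it requires the legacy bob API used by the code):

# Illustration only: how a scheduled value for 'rate' reaches setup().
# Missing keys fall back to the defaults hard-coded in __init__.
algorithm = Algorithm()
algorithm.setup({'rate': 16000.0})  # e.g. the default sampling rate, in Hz
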
import bob
import numpy

import logging
logger = logging.getLogger("bob.c++")


def normalize_std_array(vector):
  """Applies a unit mean and variance normalization to an arrayset"""

  # Initializes variables
  length = 1
  n_samples = len(vector)
  mean = numpy.ndarray((length,), 'float64')
  std = numpy.ndarray((length,), 'float64')

  mean.fill(0)
  std.fill(0)

  # Computes mean and variance
  for array in vector:
    x = array.astype('float64')
    mean += x
    std += (x ** 2)

  mean /= n_samples
  std /= n_samples
  std -= (mean ** 2)
  std = std ** 0.5
  arrayset = numpy.ndarray(shape=(n_samples, mean.shape[0]), dtype=numpy.float64)

  for i in range(0, n_samples):
    arrayset[i, :] = (vector[i] - mean) / std
  return arrayset


def smoothing(labels, smoothing_window):
  """Applies a smoothing on VAD"""

  if numpy.sum(labels) < smoothing_window:
    return labels
  segments = []
  for k in range(1, len(labels) - 1):
    if labels[k] == 0 and labels[k-1] == 1 and labels[k+1] == 1:
      labels[k] = 1
  for k in range(1, len(labels) - 1):
    if labels[k] == 1 and labels[k-1] == 0 and labels[k+1] == 0:
      labels[k] = 0

  seg = numpy.array([0, 0, labels[0]])
  for k in range(1, len(labels)):
    if labels[k] != labels[k-1]:
      seg[1] = k - 1
      segments.append(seg)
      seg = numpy.array([k, k, labels[k]])
  seg[1] = len(labels) - 1
  segments.append(seg)

  if len(segments) < 2:
    return labels

  curr = segments[0]
  next = segments[1]

  # Look at the first segment. If it's short enough, just change its labels
  if (curr[1]-curr[0]+1) < smoothing_window and (next[1]-next[0]+1) > smoothing_window:
    if curr[2] == 1:
      labels[curr[0] : (curr[1]+1)] = numpy.zeros(curr[1] - curr[0] + 1)
      curr[2] = 0
    else:  # curr[2]==0
      labels[curr[0] : (curr[1]+1)] = numpy.ones(curr[1] - curr[0] + 1)
      curr[2] = 1

  for k in range(1, len(segments) - 1):
    prev = segments[k-1]
    curr = segments[k]
    next = segments[k+1]

    if (curr[1]-curr[0]+1) < smoothing_window and (prev[1]-prev[0]+1) > smoothing_window and (next[1]-next[0]+1) > smoothing_window:
      if curr[2] == 1:
        labels[curr[0] : (curr[1]+1)] = numpy.zeros(curr[1] - curr[0] + 1)
        curr[2] = 0
      else:  # curr[2]==0
        labels[curr[0] : (curr[1]+1)] = numpy.ones(curr[1] - curr[0] + 1)
        curr[2] = 1

  prev = segments[-2]
  curr = segments[-1]

  if (curr[1]-curr[0]+1) < smoothing_window and (prev[1]-prev[0]+1) > smoothing_window:
    if curr[2] == 1:
      labels[curr[0] : (curr[1]+1)] = numpy.zeros(curr[1] - curr[0] + 1)
      curr[2] = 0
    else:  # curr[2]==0
      labels[curr[0] : (curr[1]+1)] = numpy.ones(curr[1] - curr[0] + 1)
      curr[2] = 1

  return labels


class Algorithm:

  def __init__(self):
    # Cepstral parameters
    self.win_length_ms = 20
    self.win_shift_ms = 10
    # VAD parameters
    self.alpha = 2
    self.max_iterations = 10
    self.smoothing_window = 10  # This corresponds to 100ms
    self.rate = 16000


  def _voice_activity_detection(self, energy_array):
    #########################
    ## Initialisation part ##
    #########################
    max_iterations = self.max_iterations
    alpha = self.alpha
    n_samples = len(energy_array)

    normalized_energy = normalize_std_array(energy_array)

    kmeans = bob.machine.KMeansMachine(2, 1)

    logger_propagate = logger.propagate
    # Mute logger propagation
    if logger_propagate:
      logger.propagate = False
    m_ubm = bob.machine.GMMMachine(2, 1)

    kmeans_trainer = bob.trainer.KMeansTrainer()
    kmeans_trainer.convergence_threshold = 0.0005
    kmeans_trainer.max_iterations = max_iterations
    kmeans_trainer.check_no_duplicate = True

    # Trains using the KMeansTrainer
    kmeans_trainer.train(kmeans, normalized_energy)

    [variances, weights] = kmeans.get_variances_and_weights_for_each_cluster(normalized_energy)
    means = kmeans.means
    if numpy.isnan(means[0]) or numpy.isnan(means[1]):
      print("Warning: skip this file")
      return numpy.array(numpy.zeros(n_samples), dtype=numpy.int16)
    # Initializes the GMM
    m_ubm.means = means

    m_ubm.variances = variances
    m_ubm.weights = weights
    m_ubm.set_variance_thresholds(0.0005)

    trainer = bob.trainer.ML_GMMTrainer(True, True, True)
    trainer.convergence_threshold = 0.0005
    trainer.max_iterations = 25
    trainer.train(m_ubm, normalized_energy)
    means = m_ubm.means
    weights = m_ubm.weights

    # Enable logger propagation again
    if logger_propagate:
      logger.propagate = True

    if means[0] < means[1]:
      higher = 1
      lower = 0
    else:
      higher = 0
      lower = 1

    label = numpy.array(numpy.ones(n_samples), dtype=numpy.int16)

    higher_mean_gauss = m_ubm.update_gaussian(higher)
    lower_mean_gauss = m_ubm.update_gaussian(lower)

    k = 0
    for i in range(n_samples):
      if higher_mean_gauss.log_likelihood(normalized_energy[i]) < lower_mean_gauss.log_likelihood(normalized_energy[i]):
        label[i] = 0
      else:
        label[i] = label[i] * 1
    print("After Energy-based VAD there are %d frames remaining over %d" % (numpy.sum(label), len(label)))

    return label


  def setup(self, parameters):
    self.rate = parameters.get('rate', self.rate)
    wl = self.win_length_ms
    ws = self.win_shift_ms
    alpha = self.alpha
    max_iterations = self.max_iterations
    smoothing_window = self.smoothing_window
    rate = self.rate
    self.preprocessor = bob.ap.Energy(rate, wl, ws)
    return True


  def process(self, inputs, outputs):
    float_wav = inputs["speech"].data.value
    energy_array = self.preprocessor(float_wav)
    labels = self._voice_activity_detection(energy_array)

    vad_labels = smoothing(labels, 10)  # discard isolated speech less than 100ms

    outputs["vad"].write({
      'value': vad_labels
    })

    return True

The code above is the Python implementation of this algorithm.
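
As a quick illustration of the smoothing() helper defined in the code above, the following hedged sketch runs it on a short, made-up label sequence (the values are arbitrary and only serve to show the call):

import numpy

# Made-up frame labels (1 = speech, 0 = non-speech), purely for illustration.
toy_labels = numpy.array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=numpy.int16)

# With a 3-frame window: isolated flips are removed first, then segments
# shorter than the window that sit between long segments are relabelled.
smoothed = smoothing(toy_labels.copy(), 3)
print(smoothed)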

This algorithm implements energy-based voice activity detection. It models the frame energies with two Gaussian distributions and assumes that the Gaussian with the higher mean corresponds to speech, whereas the Gaussian with the lower mean corresponds to non-speech (typically silence).
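
To make the modelling step more concrete, here is a hedged, numpy-only sketch of the same idea: split the one-dimensional frame energies into two clusters and treat the cluster with the higher mean as speech. It is a simplified stand-in (a plain two-centre k-means on made-up data), not the bob k-means/GMM training used in the actual code above.

import numpy

def toy_energy_vad(energies, iterations=10):
  """Simplified stand-in: two-cluster split of 1-D frame energies,
  labelling the higher-mean cluster as speech (1) and the other as 0."""
  e = numpy.asarray(energies, dtype=numpy.float64)
  lo, hi = e.min(), e.max()  # crude initial cluster centres
  labels = numpy.zeros(len(e), dtype=numpy.int16)
  for _ in range(iterations):
    labels = (numpy.abs(e - hi) < numpy.abs(e - lo)).astype(numpy.int16)
    if (labels == 1).any():
      hi = e[labels == 1].mean()  # update the 'speech' centre
    if (labels == 0).any():
      lo = e[labels == 0].mean()  # update the 'non-speech' centre
  return labels

# Made-up energies: 50 low-energy 'silence' frames and 50 higher-energy 'speech' frames.
toy = numpy.concatenate([numpy.random.normal(0.1, 0.05, 50),
                         numpy.random.normal(2.0, 0.3, 50)])
print(toy_energy_vad(toy).sum(), "of", len(toy), "frames labelled as speech")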

The following parameters are set inside the script:

  • win_length_ms: length of the analysis window, in milliseconds
  • win_shift_ms: shift between successive windows, in milliseconds
  • max_iterations: maximum number of iterations for the k-means training
  • smoothing_window: size, in frames, of the smoothing window used for speech detection (see the sketch below)
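
Since win_shift_ms fixes the frame rate of the energy stream, the smoothing window expressed in frames maps directly to a duration. A small worked sketch with the defaults hard-coded in the script above:

# Worked example with the script's default values.
win_length_ms = 20      # each energy frame covers 20 ms of signal
win_shift_ms = 10       # a new frame starts every 10 ms
smoothing_window = 10   # measured in frames

# One frame every 10 ms, so a 10-frame window spans 10 * 10 = 100 ms,
# which matches the "less than 100ms" comment in process() above.
print(smoothing_window * win_shift_ms, "ms")  # -> 100 ms
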
No experiments are using this algorithm.
Version history: elie_khoury/energy/1 and elie_khoury/energy/2 (August-September 2014).

This table shows the number of times this algorithm has been successfully run in the given environment. Note that this does not provide enough information to determine whether the algorithm will run when submitted under different conditions.
