Source code for bob.bio.spear.extractor.Cepstral

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Elie Khoury <Elie.Khoury@idiap.ch>
# Tue  9 Jun 23:10:43 CEST 2015
#
# Copyright (C) 2012-2013 Idiap Research Institute, Martigny, Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Cepstral Features for speaker recognition"""

import numpy
import bob.ap
from .. import utils

import logging
logger = logging.getLogger("bob.bio.spear")

from bob.bio.base.extractor import Extractor


[docs]class Cepstral(Extractor):
  """ Extracts the Cepstral features """
  def __init__(
      self,
      win_length_ms = 20,
      win_shift_ms = 10,
      n_filters = 24 ,
      dct_norm = False,
      f_min = 0.0,
      f_max = 4000,
      delta_win = 2,
      mel_scale = True,
      with_energy = True,
      with_delta = True,
      with_delta_delta = True,
      n_ceps = 19, # 0-->18
      pre_emphasis_coef = 0.95,
      features_mask = numpy.arange(0,60),
      # Normalization
      normalize_flag = True,
      **kwargs
  ):
      # call base class constructor with its set of parameters
    Extractor.__init__(
        self,
        win_length_ms = win_length_ms,
        win_shift_ms = win_shift_ms,
        n_filters = n_filters,
        dct_norm = dct_norm,
        f_min = f_min,
        f_max = f_max,
        delta_win = delta_win,
        mel_scale = mel_scale,
        with_energy = with_energy,
        with_delta = with_delta,
        with_delta_delta = with_delta_delta,
        n_ceps = n_ceps,
        pre_emphasis_coef = pre_emphasis_coef,
        features_mask = features_mask,
        normalize_flag = normalize_flag,
    )
    # copy parameters
    self.win_length_ms = win_length_ms
    self.win_shift_ms = win_shift_ms
    self.n_filters = n_filters
    self.dct_norm = dct_norm
    self.f_min = f_min
    self.f_max = f_max
    self.delta_win = delta_win
    self.mel_scale = mel_scale
    self.with_energy = with_energy
    self.with_delta = with_delta
    self.with_delta_delta = with_delta_delta
    self.n_ceps = n_ceps
    self.pre_emphasis_coef = pre_emphasis_coef
    self.features_mask = features_mask
    self.normalize_flag = normalize_flag



[docs]  def normalize_features(self, params):
  #########################
  ## Initialisation part ##
  #########################

    normalized_vector = [ [ 0 for i in range(params.shape[1]) ] for j in range(params.shape[0]) ]
    for index in range(params.shape[1]):
      vector = numpy.array([row[index] for row in params])
      n_samples = len(vector)
      norm_vector = utils.normalize_std_array(vector)

      for i in range(n_samples):
        normalized_vector[i][index]=numpy.asscalar(norm_vector[i])
    data = numpy.array(normalized_vector)
    return data




  def __call__(self, input_data):
    """Computes and returns normalized cepstral features for the given input data
    input_data[0] --> sampling rate
    input_data[1] -->  sample data
    input_data[2] --> VAD array (either 0 or 1)
    """

    rate             = input_data[0]
    wavsample = input_data[1]
    vad_labels  = input_data[2]

    # Set parameters
    wl = self.win_length_ms
    ws = self.win_shift_ms
    nf = self.n_filters
    nc = self.n_ceps
    f_min = self.f_min
    f_max = self.f_max
    dw = self.delta_win
    pre = self.pre_emphasis_coef

    ceps = bob.ap.Ceps(rate, wl, ws, nf, nc, f_min, f_max, dw, pre)
    ceps.dct_norm = self.dct_norm
    ceps.mel_scale = self.mel_scale
    ceps.with_energy = self.with_energy
    ceps.with_delta = self.with_delta
    ceps.with_delta_delta = self.with_delta_delta

    cepstral_features = ceps(wavsample)
    features_mask = self.features_mask
    if vad_labels is not None: # don't apply VAD
      filtered_features = numpy.ndarray(shape=((vad_labels == 1).sum(),len(features_mask)), dtype=numpy.float64)
      i=0
      cur_i=0
      for row in cepstral_features:
        if i < len(vad_labels):
          if vad_labels[i]==1:
            for k in range(len(features_mask)):
              filtered_features[cur_i,k] = row[features_mask[k]]
            cur_i = cur_i + 1
          i = i+1
        else:
          if vad_labels[-1]==1:
            if cur_i == cepstral_features.shape[0]:
              for k in range(len(features_mask)):
                filtered_features[cur_i,k] = row[features_mask[k]]
              cur_i = cur_i + 1
          i = i+1
    else:
      filtered_features = cepstral_features

    if self.normalize_flag:
      normalized_features = self.normalize_features(filtered_features)
    else:
      normalized_features = filtered_features
    if normalized_features.shape[0] == 0:
      logger.warn("No speech found for this utterance")
      # But do not keep it empty!!! This avoids errors in next steps
      normalized_features=numpy.array([numpy.zeros(len(features_mask))])
    return normalized_features