Train a diarization system and output a model (the model must include the speaker database)

This algorithm is an autonomous one. Its process() method will only be called once and is expected to iterate by itself through the data incoming on its inputs.

Algorithms have at least one input and one output. All algorithm endpoints are organized in groups. Groups are used by the platform to indicate which inputs and outputs are synchronized together. The first group is automatically synchronized with the channel defined by the block in which the algorithm is deployed.

Group: group

Endpoint Name Data Format Nature
features system/array_2d_floats/1 Input
uem anthony_larcher/uemranges/1 Input
file_info anthony_larcher/file_info_sd/1 Input
speakers anthony_larcher/speakers/1 Input
model anthony_larcher/array_1d_uint8/1 Output
xxxxxxxxxx
362
 
1
import copy
2
import h5py
3
import numpy
4
import pickle
5
import random
6
import sidekit
7
import struct
8
import s4d
9
import os
10
import logging
11
12
class H5Dict(dict):
    """
    Dictionary subclass used to exchange features between algorithms.

    Storing and fetching items behaves exactly like a plain ``dict``.
    Iteration is the only difference: it always yields the single name
    ``"cep"``, whatever keys are actually stored.
    """

    def __iter__(self):
        # Iteration is intentionally decoupled from the stored keys.
        names = ["cep"]
        return iter(names)

    def __getitem__(self, key):
        return dict.__getitem__(self, key)

    def __setitem__(self, key, value):
        dict.__setitem__(self, key, value)
26
27
28
class AlliesExtractor(sidekit.FeaturesExtractor):
    """
    FeaturesExtractor that serves cepstral coefficients read from a data
    loader instead of computing them.
    """

    def __init__(self, dataloader, name_dict, key="features"):
        """
        :param dataloader: indexable loader; ``dataloader[i]`` must unpack
            into three items, the first one giving access to ``key``
        :param name_dict: mapping from a show name to its index in
            ``dataloader``
        :param key: name of the loader entry that holds the features
        """
        super().__init__()
        self.dataloader = dataloader
        self.name_dict = name_dict
        self.audio_filename_structure = "{}"
        self.key = key

    def extract(self, show, channel, input_audio_filename):
        """
        Build the feature container of one show.

        ``channel`` and ``input_audio_filename`` are accepted but not used
        by this implementation.

        :param show: name of the show, must be a key of ``name_dict``
        :return: an ``H5Dict`` holding, under ``show + "/<name>"`` keys, the
            energy (column 0 of the stored features), the cepstral
            coefficients (remaining columns), their mean/std and an
            all-ones ``int8`` VAD label
        """
        entry, _start, _end = self.dataloader[self.name_dict[show]]
        frames = entry[self.key].value
        energy = frames[:, 0]
        coefficients = frames[:, 1:]
        # Every frame is labelled as speech.
        speech_label = numpy.ones((frames.shape[0]), dtype=bool)
        prefix = show + "/"

        container = H5Dict()
        container[show] = iter(["cep"])
        container["compression"] = 0
        container[prefix + "energy"] = energy
        container[prefix + "energy_mean"] = energy.mean()
        container[prefix + "energy_std"] = energy.std()
        container[prefix + "cep"] = coefficients
        # NOTE(review): mean/std are taken along axis 1 (one value per row);
        # confirm this is intended rather than axis 0 (one per coefficient).
        container[prefix + "cep_mean"] = coefficients.mean(1)
        container[prefix + "cep_std"] = coefficients.std(1)
        container[prefix + "vad"] = speech_label.astype("int8")
        return container
56
57
58
def setup_for_s4d(data_loaders):
    """
    Prepare the s4d/sidekit objects needed to train on the block inputs.

    :param data_loaders: object whose ``loaderOf("features")`` method returns
        the loader giving access to the inputs of this block
    :return: a tuple ``(global_diar, fs_acoustic, feature_size, end_index)``:
        the ``s4d.diar.Diar`` filled with every speech segment, the
        ``sidekit.FeaturesServer`` serving the acoustic features, the size of
        the feature vectors and the end index of the last entry read from
        the loader
    :raises IndexError: if the loader does not contain any entry
    """
    # The loader is the interface used to access the inputs of this block
    loader = data_loaders.loaderOf("features")

    # The s4d.Diar stores, for each speech segment:
    #   the file_id (show)
    #   the cluster, used to store the speaker index
    #   the start and stop times
    global_diar = s4d.diar.Diar()
    global_diar.add_attribut(new_attribut="gender", default="U")

    # Fill the s4d.Diar object and the dictionary "name_dict" that maps
    # each file_id to the index of its entry in the loader
    name_dict = {}
    end_index = None
    for i in range(loader.count()):
        (data, _, end_index) = loader[i]
        speech_segments = data["speakers"]
        file_id = data["file_info"].file_id

        for idx in range(speech_segments.speaker.size):
            # Times are multiplied by 100 and rounded to integers
            global_diar.append(
                show=file_id,
                cluster=speech_segments.speaker[idx],
                start=int(round(speech_segments.start_time[idx] * 100, 0)),
                stop=int(round(speech_segments.end_time[idx] * 100, 0)),
            )
        name_dict[file_id] = i

    if not name_dict:
        # Fail with an explicit message instead of an anonymous IndexError
        # raised later while looking up the first file.
        raise IndexError("setup_for_s4d: the 'features' loader contains no file")

    # Create two objects to access the acoustic features
    fe_train = AlliesExtractor(loader, name_dict)
    fs_acoustic = sidekit.FeaturesServer(
        features_extractor=fe_train,
        dataset_list=["energy", "cep"],
        keep_all_features=True,
        delta=True,
        double_delta=True,
        feat_norm="cmvn_sliding"
    )

    # Get the feature size from the first file: three times the number of
    # "cep" columns, plus three. Presumably this accounts for the energy
    # column and for the delta / double-delta requested above (to confirm).
    first_show = next(iter(name_dict))
    first_cep = fe_train.extract(first_show, 0, first_show)[first_show + "/cep"]
    feature_size = first_cep.shape[1] * 3 + 3

    return global_diar, fs_acoustic, feature_size, end_index
110
111
112
113
def aggregate_models(ubm, tv_fa, norm_mean, norm_cov, plda_fa, iv_stat, global_diar):
    """
    Aggregate the components of the acoustic model to exchange them between
    algorithms.

    The components are gathered in a dictionary under the keys ``"ubm"``,
    ``"tv_fa"``, ``"norm_mean"``, ``"norm_cov"``, ``"plda_fa"``,
    ``"iv_stat"`` and ``"global_diar"``; the dictionary is pickled and the
    resulting bytes are returned as an array.

    :return: a writable 1-D ``numpy.uint8`` array holding the pickled model;
        ``pickle.loads(result.tobytes())`` gives the dictionary back
    """
    model = {
        "ubm": ubm,
        "tv_fa": tv_fa,
        "norm_mean": norm_mean,
        "norm_cov": norm_cov,
        "plda_fa": plda_fa,
        "iv_stat": iv_stat,
        "global_diar": global_diar,
    }

    pkl = pickle.dumps(model)
    # frombuffer reinterprets the bytes directly instead of creating one
    # Python integer per byte; the copy makes the array writable and
    # independent from the bytes object.
    return numpy.frombuffer(pkl, dtype=numpy.uint8).copy()
131
132
133
def keep_recurring_speakers(iv_ss, rank_F, occ_number=2, filter_by_name=False):
    """
    Keep only the entries of ``iv_ss`` that belong to recurring speakers.

    A model is kept when it appears at least ``occ_number`` times in
    ``iv_ss.modelset`` and, if ``filter_by_name`` is true, when its name
    does not start with one of the generic labels listed below, does not
    contain ``"#"`` and contains ``"_"``.

    :param iv_ss: object exposing ``modelset``, ``segset``, ``start``,
        ``stop`` and ``stat1``
    :param rank_F: number of columns of ``stat1`` in the returned server
    :param occ_number: minimum number of entries a model needs to be kept
    :param filter_by_name: also filter the models on their name
    :return: a tuple ``(flt_iv, speaker_nb, kept_idx)``: a new
        ``sidekit.StatServer`` restricted to the kept entries, the number of
        kept models, and the integer array of positions of the kept entries
        in ``iv_ss`` (grouped by model, in the order of ``numpy.unique``)
    """
    generic_prefixes = (
        "speaker",
        "presentat",
        "sup+",
        "voix",
        "publicitaire",
        "locuteur",
        "inconnu",
        "traduct",
    )

    def _is_named_speaker(name):
        # Names starting with a generic label, containing "#" or lacking "_"
        # are rejected.
        return (
            not name.startswith(generic_prefixes)
            and "#" not in name
            and "_" in name
        )

    models, counts = numpy.unique(iv_ss.modelset, return_counts=True)
    kept_models = [
        name
        for name, count in zip(models, counts)
        if count >= occ_number and (not filter_by_name or _is_named_speaker(name))
    ]

    # Positions are kept as integers so that they can be used as indices
    # (numpy.append on an empty list would silently turn them into floats).
    per_model_idx = [numpy.where(iv_ss.modelset == name)[0] for name in kept_models]
    if per_model_idx:
        kept_idx = numpy.concatenate(per_model_idx)
    else:
        kept_idx = numpy.array([], dtype=numpy.intp)
    tot_sessions = kept_idx.size

    flt_iv = sidekit.StatServer()
    flt_iv.stat0 = numpy.ones((tot_sessions, 1))
    flt_iv.stat1 = numpy.ones((tot_sessions, rank_F))
    flt_iv.modelset = numpy.empty(tot_sessions, dtype="|O")
    flt_iv.segset = numpy.empty(tot_sessions, dtype="|O")
    flt_iv.start = numpy.empty(tot_sessions, dtype="|O")
    flt_iv.stop = numpy.empty(tot_sessions, dtype="|O")

    # Copy the kept entries, one row at a time
    for row, source in enumerate(kept_idx):
        flt_iv.stat1[row, :] = iv_ss.stat1[source, :]
        flt_iv.modelset[row] = iv_ss.modelset[source]
        flt_iv.segset[row] = iv_ss.segset[source]
        flt_iv.start[row] = iv_ss.start[source]
        flt_iv.stop[row] = iv_ss.stop[source]

    return flt_iv, len(kept_models), kept_idx
197
198
199
200
201
class Algorithm:
    """
    Train a diarization system (UBM, total variability matrix, i-vector
    normalization and PLDA) and write the resulting model on the ``model``
    output.
    """

    # initialise fields to store cross-input data (e.g. machines, aggregations, etc.)
    def __init__(self):
        """
        Set the meta-parameters of the model
        """
        # The values in trailing comments come from the original code,
        # presumably the full-size configuration (to confirm).
        self.distrib_nb = 16  # 256
        self.iv_size = 50  # 150
        self.plda_rank = 10  # 50
        # Set by process() from the training data
        self.feature_size = None
        self.tv_iteration_nb = 5

    # called once; iterates by itself through the data available on the inputs
    def process(self, data_loaders, outputs):
        """
        Use the training data provided through the data loaders in order to
        train the acoustic model and write it on the output

        :param data_loaders: input parameter used to access all incoming data
        :param outputs: parameter used to send the model to the next block
        :return: True once the model has been written
        """
        logging.debug("Training the diarization model")

        global_diar, fs_acoustic, feature_size, end_index = setup_for_s4d(data_loaders)
        self.feature_size = feature_size

        ############################################################################
        # Start training the acoustic system.
        # TODO: add a link to the description of the system

        # Train the UBM on the first 100 speech segments
        ubm_diar = copy.deepcopy(global_diar)
        ubm_diar.segments = ubm_diar.segments[:100]
        ubm_idmap = ubm_diar.id_map()
        ubm = sidekit.Mixture()
        ubm.EM_split(fs_acoustic, ubm_idmap, self.distrib_nb, num_thread=1, save_partial=False)

        # Segments whose duration is at least 100 (1s according to the
        # original comments); computed once and reused below
        long_diar = global_diar.filter("duration", ">=", 100)

        # Randomly select a maximum of 1000 of these segments to train the
        # Total Variability matrix
        tv_diar = global_diar.copy_structure()
        tv_diar.segments = random.sample(
            long_diar.segments, k=min(len(long_diar), 1000)
        )
        tv_idmap = tv_diar.id_map()

        # Get the list of long segments that were not selected
        long_seg_diar = global_diar.copy_structure()
        for seg in long_diar.segments:
            if seg not in tv_diar:
                long_seg_diar.append_seg(seg)
        long_seg_idmap = long_seg_diar.id_map()

        # Shorter segments are kept aside for clustering purposes
        short_seg_diar = global_diar.filter("duration", "<", 100)
        short_seg_idmap = short_seg_diar.id_map()

        # Accumulate sufficient statistics for the three sets of segments
        stat_servers = [
            sidekit.StatServer(
                idmap,
                distrib_nb=ubm.get_distrib_nb(),
                feature_size=self.feature_size
            )
            for idmap in (tv_idmap, long_seg_idmap, short_seg_idmap)
        ]
        for stat_server in stat_servers:
            stat_server.accumulate_stat(
                ubm=ubm,
                feature_server=fs_acoustic,
                seg_indices=range(stat_server.segset.shape[0]),
                num_thread=1
            )
        tv_stat, long_seg_stat, short_seg_stat = stat_servers

        # Sufficient statistics selected for training are passed to the
        # FactorAnalyser via an HDF5 file handler that stays in memory
        # (core driver without backing store); the handler is closed as soon
        # as the training is done.
        tv_fa = sidekit.FactorAnalyser()
        with h5py.File("tv.h5", "a", backing_store=False, driver="core") as tv_stat_h5f:
            tv_stat.to_hdf5(tv_stat_h5f)
            tv_fa.total_variability(
                tv_stat_h5f,
                ubm,
                tv_rank=self.iv_size,
                nb_iter=self.tv_iteration_nb,
                min_div=True,
                tv_init=None,
                batch_size=300,
                save_init=False,
                output_file_name=None,
                num_thread=1,
            )

        # Extract i-vectors for the three sets of segments
        tv_iv = tv_fa.extract_ivectors_single(
            ubm, tv_stat, uncertainty=False
        )
        tmp_long_seg_iv = tv_fa.extract_ivectors_single(
            ubm, long_seg_stat, uncertainty=False
        )
        short_seg_iv = tv_fa.extract_ivectors_single(
            ubm, short_seg_stat, uncertainty=False
        )

        # All long segments (selected or not), then long and short together
        long_seg_iv = tv_iv.merge(tmp_long_seg_iv)
        global_iv = long_seg_iv.merge(short_seg_iv)

        # Train the Probabilistic Linear Discriminant Analyser
        # First estimate the normalization parameters on the long i-vectors
        norm_mean, norm_cov = long_seg_iv.estimate_spectral_norm_stat1(1, "sphNorm")

        # Filter the i-vector set in order to keep only recurrent speakers
        # to train the PLDA model
        norm_iv, _, _ = keep_recurring_speakers(
            long_seg_iv, rank_F=self.iv_size, occ_number=2, filter_by_name=True
        )

        norm_iv.spectral_norm_stat1(norm_mean, norm_cov)

        # EM algorithm to train the PLDA model
        plda_fa = sidekit.FactorAnalyser()
        plda_fa.plda(
            norm_iv,
            rank_f=self.plda_rank,
            nb_iter=10,
            scaling_factor=1.0,
            output_file_name=None,
            save_partial=False,
            save_final=False
        )

        # Return the model in a format that can be pickled
        model = aggregate_models(
            ubm, tv_fa, norm_mean, norm_cov, plda_fa, global_iv, global_diar
        )
        outputs["model"].write({"value": model}, end_index)
        return True
360
361
362

The code for this algorithm in Python
The ruler at 80 columns indicates suggested POSIX line breaks (for readability).
The editor will automatically enlarge to accommodate the entirety of your input
Use keyboard shortcuts for search/replace and faster editing. For example, use Ctrl-F (PC) or Cmd-F (Mac) to search through this box

Could not find any documentation for this object.
No experiments are using this algorithm.
Created with Raphaël 2.1.2[compare]anthony_larcher/diar_train/12020Mar9

This table shows the number of times this algorithm has been successfully run using the given environment. Note this does not provide sufficient information to evaluate if the algorithm will run when submitted to different conditions.

Terms of Service | Contact Information | BEAT platform version 2.2.1b0 | © Idiap Research Institute - 2013-2025