Source code for bob.ip.facedetect.train.Bootstrap

import numpy
import os

import bob.ip.base
import bob.learn.boosting

import logging
logger = logging.getLogger('bob.ip.facedetect')

class Bootstrap:
    """This class deals with selecting new training examples for each boosting round.

    Bootstrapping a classifier works as follows:

    1. round = 1
    2. the classifier is trained on a random subset of the training data, where
       ``number_of_positive_examples_per_round`` and
       ``number_of_negative_examples_per_round`` define the number of positive
       and negative examples used in the first round
    3. add ``number_of_weak_learners_in_first_round * 2**(round-1)`` weak
       classifiers (selected using boosting)
    4. evaluate the whole training set using the current set of classifiers
    5. add the new data that is mis-classified by the largest margin to the
       training set
    6. round = round + 1
    7. if round < ``number_of_rounds`` goto 3

    **Constructor Documentation**

    Creates a new Bootstrap object that can be used to start or continue the
    training of a bootstrapped boosted classifier.

    **Parameters:**

    ``number_of_rounds`` : int
      The number of bootstrapping rounds, where each round adds more weak
      learners to the model

    ``number_of_weak_learners_in_first_round`` : int
      The number of weak classifiers chosen in the first round; each later
      round doubles this number, so don't choose it too large

    ``number_of_positive_examples_per_round, number_of_negative_examples_per_round`` : int
      The number of positive and negative samples added in each bootstrapping
      round; these numbers should be balanced, but do not necessarily need to be
    """

    def __init__(self, number_of_rounds = 7, number_of_weak_learners_in_first_round = 8, number_of_positive_examples_per_round = 5000, number_of_negative_examples_per_round = 5000):
        self.m_number_of_rounds = number_of_rounds
        # Round i (0-based) uses twice as many weak learners as round i-1.
        self.m_number_of_weak_learners_per_round = [
            number_of_weak_learners_in_first_round * 2**i
            for i in range(number_of_rounds)
        ]
        self.m_number_of_positive_examples_per_round = number_of_positive_examples_per_round
        self.m_number_of_negative_examples_per_round = number_of_negative_examples_per_round

    def run(self, training_set, trainer, filename = "bootstrapped_model.hdf5", force = False):
        """run(training_set, trainer, [filename], [force]) -> model

        Runs the bootstrapped training of a strong classifier using the given
        training data and a strong classifier trainer. The training set needs
        to contain extracted features already, as this function will need the
        features several times.

        **Parameters:**

        ``training_set`` : :py:class:`TrainingSet`
          The training set containing pre-extracted feature files

        ``trainer`` : :py:class:`bob.learn.boosting.Boosting`
          A strong boosting trainer to use for selecting the weak classifiers
          and their weights for each round.

        ``filename`` : str
          A filename, where to write the resulting strong classifier to. This
          filename is also used as a base to compute filenames of intermediate
          files (``<base>_round_<N>.hdf5``), which store results of each of the
          bootstrapping steps.

        ``force`` : bool
          If set to ``False`` (the default), the bootstrapping will continue
          the round, where it has been stopped during the last run (reading the
          current stage from respective files). If set to ``True``, the
          training will start from the beginning.

        **Returns:**

        ``model`` : :py:class:`bob.learn.boosting.BoostedMachine`
          The resulting strong classifier, a weighted combination of weak
          classifiers. ``None`` if ``number_of_rounds`` is 0.
        """
        # NOTE(review): bob.io.base is used below but is not among the imports
        # visible at the top of this file; presumably it becomes reachable
        # through another bob import -- confirm, or import it explicitly.
        feature_extractor = training_set.feature_extractor()
        base_name = os.path.splitext(filename)[0]

        training_data = None
        training_labels = None
        model = None

        # Indices collected from stages that were loaded from disk and whose
        # data has not been re-sampled yet.
        positive_indices, negative_indices = set(), set()

        for b in range(self.m_number_of_rounds):
            # check if old results are present
            temp_file = "%s_round_%d.hdf5" % (base_name, b+1)

            if os.path.exists(temp_file) and not force:
                logger.info("Loading already computed stage %d from %s.", b+1, temp_file)
                model, positives, negatives = self._load(bob.io.base.HDF5File(temp_file))
                positive_indices |= positives
                negative_indices |= negatives

            else:
                if positive_indices or negative_indices:
                    # load data from previous rounds
                    logger.info("Getting training data of previous rounds")
                    training_data, training_labels = training_set.sample(positive_indices = positive_indices, negative_indices = negative_indices)
                    # the indices are consumed now; do not re-sample them later
                    positive_indices, negative_indices = set(), set()

                # get data for current round
                logger.info("Getting new data for bootstrapping round %d", b+1)
                new_data, new_labels = training_set.sample(model, self.m_number_of_positive_examples_per_round, self.m_number_of_negative_examples_per_round)

                # accumulate the new samples on top of what we already have
                if training_data is None:
                    training_data = new_data
                else:
                    training_data = numpy.append(training_data, new_data, axis=0)
                if training_labels is None:
                    training_labels = new_labels
                else:
                    training_labels = numpy.append(training_labels, new_labels, axis=0)

                logger.info("Starting training with %d examples", training_data.shape[0])
                model = trainer.train(training_data, training_labels, self.m_number_of_weak_learners_per_round[b], model)

                # write model and extractor to temporary file to be able to catch up later
                logger.info("Saving results for stage %d to file %s", b+1, temp_file)
                self._save(bob.io.base.HDF5File(temp_file, 'w'), model, training_set.positive_indices, training_set.negative_indices)

            # NOTE(review): applied after every round (loaded or trained), so
            # the extractor always reflects the current model; confirm this
            # placement against the upstream file.
            feature_extractor.model_indices = model.indices

        # finally, return the trained model
        return model

    def _save(self, hdf5, model, positives, negatives):
        """Saves the given intermediate state of the bootstrapping to file.

        Writes the sorted ``positives`` and ``negatives`` indices under the
        keys ``PositiveIndices`` and ``NegativeIndices``, and the ``model``
        into a newly created ``Model`` group of the given ``hdf5`` file.
        """
        # write the model and the training set indices to the given HDF5 file
        hdf5.set("PositiveIndices", sorted(positives))
        hdf5.set("NegativeIndices", sorted(negatives))
        hdf5.create_group("Model")
        hdf5.cd("Model")
        model.save(hdf5)
        # Drops only this local reference; presumably meant to let the file be
        # finalized once the caller's temporary goes away -- TODO confirm.
        del hdf5

    def _load(self, hdf5):
        """Loads the intermediate state of the bootstrapping from file.

        Returns the tuple ``(model, positives, negatives)``, where the indices
        are read from ``PositiveIndices`` / ``NegativeIndices`` as sets and the
        model is constructed from the ``Model`` group of ``hdf5``.
        """
        positives = set(hdf5.get("PositiveIndices"))
        negatives = set(hdf5.get("NegativeIndices"))
        hdf5.cd("Model")
        model = bob.learn.boosting.BoostedMachine(hdf5)
        return model, positives, negatives