Source code for bob.med.tb.utils.measure

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import deque
import numpy
import torch
import scipy.special


class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window"""

    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)

    def update(self, value):
        self.deque.append(value)

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque))
        return d.mean().item()

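# Illustrative usage sketch (not part of the original module): smoothing a
# noisy training loss with a 3-sample window.  The values are made up for
# demonstration.
#
#   >>> tracker = SmoothedValue(window_size=3)
#   >>> for loss in (1.0, 2.0, 3.0):
#   ...     tracker.update(loss)
#   >>> tracker.median
#   2.0
#   >>> tracker.update(4.0)  # window slides: deque now holds (2.0, 3.0, 4.0)
#   >>> tracker.avg
#   3.0
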
def tricky_division(n, d):
    """Divides ``n`` by ``d``, returning 0.0 in case of a division by zero
    (the numerators used in this module are always zero whenever the
    denominator is)"""

    return n / (d + (d == 0))
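
# Illustrative sketch (not part of the original module): when the denominator
# is zero, the boolean ``d == 0`` evaluates to 1, avoiding a
# ZeroDivisionError.
#
#   >>> tricky_division(90, 100)
#   0.9
#   >>> tricky_division(0, 0)  # e.g. precision when tp == fp == 0
#   0.0
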
def base_measures(tp, fp, tn, fn):
    """Calculates measures from true/false positive and negative counts

    This function can return standard machine learning measures from true and
    false positive counts of positives and negatives.  For a thorough look
    into these and alternate names for the returned values, please check
    Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"


    Returns
    -------

    precision : float
        P, AKA positive predictive value (PPV).  It corresponds arithmetically
        to ``tp/(tp+fp)``.  In the case ``tp+fp == 0``, this function returns
        zero for precision.

    recall : float
        R, AKA sensitivity, hit rate, or true positive rate (TPR).  It
        corresponds arithmetically to ``tp/(tp+fn)``.  In the special case
        where ``tp+fn == 0``, this function returns zero for recall.

    specificity : float
        S, AKA selectivity or true negative rate (TNR).  It corresponds
        arithmetically to ``tn/(tn+fp)``.  In the special case where
        ``tn+fp == 0``, this function returns zero for specificity.

    accuracy : float
        A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This measure
        includes both true negatives and true positives in the numerator,
        which makes it sensitive to data or regions without annotations.

    jaccard : float
        J, see `Jaccard Index or Similarity
        <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  In the special case where
        ``tp+fp+fn == 0``, this function returns zero for the Jaccard index.
        The Jaccard index depends on a TP-only numerator, similarly to the F1
        score.  For regions where there are no annotations, the Jaccard index
        will always be zero, irrespective of the model output.  Accuracy may
        be a better proxy if one needs to consider the true absence of
        annotations in a region as part of the measure.

    f1_score : float
        F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It
        corresponds arithmetically to ``2*P*R/(P+R)`` or
        ``2*tp/(2*tp+fp+fn)``.  In the special case where
        ``P+R == (2*tp+fp+fn) == 0``, this function returns zero for the
        F1-score.  The F1 or Dice score depends on a TP-only numerator,
        similarly to the Jaccard index.  For regions where there are no
        annotations, the F1-score will always be zero, irrespective of the
        model output.  Accuracy may be a better proxy if one needs to consider
        the true absence of annotations in a region as part of the measure.

    """

    return (
        tricky_division(tp, tp + fp),  # precision
        tricky_division(tp, tp + fn),  # recall
        tricky_division(tn, fp + tn),  # specificity
        tricky_division(tp + tn, tp + fp + fn + tn),  # accuracy
        tricky_division(tp, tp + fp + fn),  # jaccard index
        tricky_division(2 * tp, (2 * tp) + fp + fn),  # f1-score
    )
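
# Illustrative sketch (not part of the original module): base measures for a
# hypothetical confusion matrix with tp=90, fp=10, tn=80, fn=20.
#
#   >>> precision, recall, specificity, accuracy, jaccard, f1 = base_measures(
#   ...     tp=90, fp=10, tn=80, fn=20
#   ... )
#   >>> precision  # 90/100
#   0.9
#   >>> accuracy  # 170/200
#   0.85
#   >>> jaccard  # 90/120
#   0.75
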
def beta_credible_region(k, l, lambda_, coverage):
    """Returns the mean, mode, upper and lower bounds of the equal-tailed
    credible region of a probability estimate following Bernoulli trials.

    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
    successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
    from a series of Bernoulli trials (likelihood is binomial).  The posterior
    is derived using the Bayes Theorem with a beta prior.  As there is no
    reason to favour high vs. low precision, we use a symmetric Beta prior
    (:math:`\\alpha=\\beta`):

    .. math::

       P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}

    The mode for this posterior (also the maximum a posteriori) is:

    .. math::

       \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}

    Concretely, the prior may be flat (all rates are equally likely,
    :math:`\\lambda=1`) or we may use Jeffrey's prior (:math:`\\lambda=0.5`),
    which is invariant through re-parameterisation.  Jeffrey's prior indicates
    that rates close to zero or one are more likely.

    The mode above works if :math:`k+\\alpha, n-k+\\alpha > 1`, which is
    usually the case for a reasonably well-tuned system, with more than a few
    samples for analysis.  In the limit of the system performance, :math:`k`
    may be 0, which will make the mode become zero.

    For our purposes, it may be more suitable to represent :math:`n = k + l`,
    with :math:`k`, the number of successes and :math:`l`, the number of
    failures in the binomial experiment, which leads to this more suitable
    representation:

    .. math::

       P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
       \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}

    This can be mapped to most rates calculated in the context of binary
    classification this way:

    * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP,
      l=FP
    * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP,
      l=FN
    * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
    * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
    * Accuracy: acc = (TP+TN)/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
    * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN

    Contrary to frequentist approaches, in which one can only say that if the
    test were repeated an infinite number of times, and one constructed a
    confidence interval each time, then X% of the confidence intervals would
    contain the true rate, here we can say that given our observed data, there
    is a X% probability that the true value of :math:`k/n` falls within the
    provided interval.

    .. note::

       For a disambiguation with Confidence Interval, read
       https://en.wikipedia.org/wiki/Credible_interval.


    Parameters
    ==========

    k : int
        Number of successes observed on the experiment

    l : int
        Number of failures observed on the experiment

    lambda_ : :py:class:`float`
        The parameterisation of the Beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : :py:class:`float`
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.


    Returns
    =======

    mean : float
        The mean of the posterior distribution

    mode : float
        The mode of the posterior distribution

    lower, upper : float
        The lower and upper bounds of the credible region

    """

    # we return the equal-tailed range
    right = (1.0 - coverage) / 2  # half-width in each side
    lower = scipy.special.betaincinv(k + lambda_, l + lambda_, right)
    upper = scipy.special.betaincinv(k + lambda_, l + lambda_, 1.0 - right)

    # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
    alpha = k + lambda_
    beta = l + lambda_

    E = alpha / (alpha + beta)

    # the mode of a beta distribution is a bit tricky
    if alpha > 1 and beta > 1:
        mode = (alpha - 1) / (alpha + beta - 2)
    elif alpha == 1 and beta == 1:
        # In the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is exactly 1 (flat prior).  This is a weird situation,
        # because effectively we are trying to compute the posterior when the
        # total number of experiments is zero.  So, only the prior counts -
        # but the prior is flat, so we should just pick a value.  We choose
        # zero.
        mode = 0.0  # any value in [0, 1] would do, we just pick this one
    elif alpha <= 1 and beta > 1:
        mode = 0.0
    elif alpha > 1 and beta <= 1:
        mode = 1.0
    else:  # elif alpha < 1 and beta < 1:
        # In the case of precision, if the threshold is close to 1.0, both TP
        # and FP can be zero, which may cause this condition to be reached, if
        # the prior is smaller than 1.  This is a weird situation, because
        # effectively we are trying to compute the posterior when the total
        # number of experiments is zero.  So, only the prior counts - but the
        # prior is bimodal, so we should just pick a value.  We choose the
        # left of the range.
        mode = 0.0  # could also be 1.0 as the prior is bimodal

    return E, mode, lower, upper
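
# Illustrative sketch (not part of the original module): a 95% equal-tailed
# credible region for 10 successes and 10 failures under Jeffrey's prior.  By
# symmetry, mean and mode are exactly 0.5; the bounds are approximately
# (0.29, 0.71).
#
#   >>> mean, mode, lower, upper = beta_credible_region(
#   ...     k=10, l=10, lambda_=0.5, coverage=0.95
#   ... )
#   >>> (mean, mode)
#   (0.5, 0.5)
#   >>> lower < 0.5 < upper
#   True
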
def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
    """Calculates mean and mode from true/false positive and negative counts
    with credible regions

    This function can return bayesian estimates of standard machine learning
    measures from true and false positive counts of positives and negatives.
    For a thorough look into these and alternate names for the returned
    values, please check Wikipedia's entry on `Precision and Recall
    <https://en.wikipedia.org/wiki/Precision_and_recall>`_.  See
    :py:func:`beta_credible_region` for details on the calculation of returned
    values.


    Parameters
    ----------

    tp : int
        True positive count, AKA "hit"

    fp : int
        False positive count, AKA "false alarm", or "Type I error"

    tn : int
        True negative count, AKA "correct rejection"

    fn : int
        False Negative count, AKA "miss", or "Type II error"

    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        Jeffrey's prior.

    coverage : float
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.


    Returns
    -------

    precision : (float, float, float, float)
        P, AKA positive predictive value (PPV), mean, mode and credible
        intervals (95% CI).  It corresponds arithmetically to ``tp/(tp+fp)``.

    recall : (float, float, float, float)
        R, AKA sensitivity, hit rate, or true positive rate (TPR), mean, mode
        and credible intervals (95% CI).  It corresponds arithmetically to
        ``tp/(tp+fn)``.

    specificity : (float, float, float, float)
        S, AKA selectivity or true negative rate (TNR), mean, mode and
        credible intervals (95% CI).  It corresponds arithmetically to
        ``tn/(tn+fp)``.

    accuracy : (float, float, float, float)
        A, mean, mode and credible intervals (95% CI).  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This measure
        includes both true negatives and true positives in the numerator,
        which makes it sensitive to data or regions without annotations.

    jaccard : (float, float, float, float)
        J, mean, mode and credible intervals (95% CI).  See `Jaccard Index or
        Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It
        corresponds arithmetically to ``tp/(tp+fp+fn)``.  The Jaccard index
        depends on a TP-only numerator, similarly to the F1 score.  For
        regions where there are no annotations, the Jaccard index will always
        be zero, irrespective of the model output.  Accuracy may be a better
        proxy if one needs to consider the true absence of annotations in a
        region as part of the measure.

    f1_score : (float, float, float, float)
        F1, mean, mode and credible intervals (95% CI).  See `F1-score
        <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
        arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  The F1 or
        Dice score depends on a TP-only numerator, similarly to the Jaccard
        index.  For regions where there are no annotations, the F1-score will
        always be zero, irrespective of the model output.  Accuracy may be a
        better proxy if one needs to consider the true absence of annotations
        in a region as part of the measure.
""" return ( beta_credible_region(tp, fp, lambda_, coverage), #precision beta_credible_region(tp, fn, lambda_, coverage), #recall beta_credible_region(tn, fp, lambda_, coverage), #specificity beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score )
def get_centered_maxf1(f1_scores, thresholds):
    """Return the centered max F1 score and its threshold when multiple
    thresholds give the same max F1 score


    Parameters
    ----------

    f1_scores : numpy.ndarray
        1D array of f1 scores

    thresholds : numpy.ndarray
        1D array of thresholds


    Returns
    -------

    max F1 score : float

    threshold : float

    """

    maxf1 = f1_scores.max()
    maxf1_indices = numpy.where(f1_scores == maxf1)[0]

    # If multiple thresholds give the same max F1 score
    if len(maxf1_indices) > 1:
        mean_maxf1_index = int(round(numpy.mean(maxf1_indices)))
    else:
        mean_maxf1_index = maxf1_indices[0]

    return maxf1, thresholds[mean_maxf1_index]
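
# Illustrative sketch (not part of the original module): three consecutive
# thresholds tie at the maximum F1 score of 0.5, so the centered one is
# returned.
#
#   >>> f1_scores = numpy.array([0.1, 0.5, 0.5, 0.5, 0.2])
#   >>> thresholds = numpy.array([0.0, 0.25, 0.5, 0.75, 1.0])
#   >>> get_centered_maxf1(f1_scores, thresholds)
#   (0.5, 0.5)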