#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import deque
import numpy
import torch
import scipy.special
class SmoothedValue:
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
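
    Examples
    --------
    A minimal, illustrative usage sketch (arbitrary floating-point values;
    ``torch`` can only average floating-point tensors):

    >>> s = SmoothedValue(window_size=3)
    >>> for v in (1.0, 2.0, 3.0, 4.0): s.update(v)
    >>> s.median
    3.0
    >>> s.avg
    3.0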
"""
def __init__(self, window_size=20):
self.deque = deque(maxlen=window_size)
    def update(self, value):
self.deque.append(value)
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque))
return d.mean().item()
def tricky_division(n, d):
"""Divides n by d. Returns 0.0 in case of a division by zero"""
return n/(d+(d==0))
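# Illustrative behaviour of the helper above (arbitrary values):
#
#   tricky_division(3, 4)  # -> 0.75
#   tricky_division(0, 0)  # -> 0.0 (denominator replaced by one)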
def base_measures(tp, fp, tn, fn):
"""Calculates measures from true/false positive and negative counts
    This function returns standard machine learning measures computed from the
    counts of true and false positives and negatives.  For a thorough look into
these and alternate names for the returned values, please check Wikipedia's
entry on `Precision and Recall
<https://en.wikipedia.org/wiki/Precision_and_recall>`_.
Parameters
----------
tp : int
True positive count, AKA "hit"
    fp : int
        False positive count, AKA "false alarm", or "Type I error"
    tn : int
        True negative count, AKA "correct rejection"
fn : int
False Negative count, AKA "miss", or "Type II error"
Returns
-------
precision : float
P, AKA positive predictive value (PPV). It corresponds arithmetically
to ``tp/(tp+fp)``. In the case ``tp+fp == 0``, this function returns
zero for precision.
recall : float
R, AKA sensitivity, hit rate, or true positive rate (TPR). It
corresponds arithmetically to ``tp/(tp+fn)``. In the special case
where ``tp+fn == 0``, this function returns zero for recall.
specificity : float
S, AKA selectivity or true negative rate (TNR). It
corresponds arithmetically to ``tn/(tn+fp)``. In the special case
where ``tn+fp == 0``, this function returns zero for specificity.
accuracy : float
A, see `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This measure
        includes both true negatives and true positives in the numerator,
        which makes it sensitive to data or regions without annotations.
jaccard : float
J, see `Jaccard Index or Similarity
<https://en.wikipedia.org/wiki/Jaccard_index>`_. It corresponds
        arithmetically to ``tp/(tp+fp+fn)``.  In the special case where
        ``tp+fp+fn == 0``, this function returns zero for the Jaccard index.
The Jaccard index depends on a TP-only numerator, similarly to the F1
score. For regions where there are no annotations, the Jaccard index
will always be zero, irrespective of the model output. Accuracy may be
        a better proxy if one needs to consider the true absence of
        annotations in a region as part of the measure.
f1_score : float
F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_. It
corresponds arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.
        In the special case where ``P+R == (2*tp+fp+fn) == 0``, this function
        returns zero for the F1 score.  The F1 or Dice score depends on a
TP-only numerator, similarly to the Jaccard index. For regions where
there are no annotations, the F1-score will always be zero,
irrespective of the model output. Accuracy may be a better proxy if
        one needs to consider the true absence of annotations in a region as
        part of the measure.
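
    Examples
    --------
    An illustrative check with arbitrary counts (values rounded for display):

    >>> p, r, s, a, j, f1 = base_measures(tp=10, fp=2, tn=80, fn=8)
    >>> [round(v, 4) for v in (p, r, s, a, j, f1)]
    [0.8333, 0.5556, 0.9756, 0.9, 0.5, 0.6667]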
"""
return (
tricky_division(tp, tp + fp), #precision
tricky_division(tp, tp + fn), #recall
tricky_division(tn, fp + tn), #specificity
tricky_division(tp + tn, tp + fp + fn + tn), #accuracy
tricky_division(tp, tp + fp + fn), #jaccard index
tricky_division(2*tp, (2*tp) + fp + fn), #f1-score
)
def beta_credible_region(k, l, lambda_, coverage):
"""
    Returns the mean, mode, and upper and lower bounds of the equal-tailed
    credible region of a probability estimate following Bernoulli trials.
    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
from a series of Bernoulli trials (likelihood is binomial). The posterior
    is derived using Bayes' theorem with a beta prior.  As there is no
reason to favour high vs. low precision, we use a symmetric Beta prior
(:math:`\\alpha=\\beta`):
    .. math::

       P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k,n)} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
       P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}
The mode for this posterior (also the maximum a posteriori) is:
.. math::
\\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}
Concretely, the prior may be flat (all rates are equally likely,
    :math:`\\lambda=1`) or we may use Jeffrey's prior
    (:math:`\\lambda=0.5`), which is invariant under re-parameterisation.
    Jeffrey's prior indicates that rates close to zero or one are more likely.
    The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is
    usually the case for a reasonably well-tuned system, with more than a few
samples for analysis. In the limit of the system performance, :math:`k`
may be 0, which will make the mode become zero.
For our purposes, it may be more suitable to represent :math:`n = k + l`,
with :math:`k`, the number of successes and :math:`l`, the number of
    failures in the binomial experiment, and arrive at this equivalent
    representation:
.. math::
P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
\\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}
This can be mapped to most rates calculated in the context of binary
classification this way:
* Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
* Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
    * Specificity or True Negative Rate: s = TN/(TN+FP), so k=TN, l=FP
* F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
    * Accuracy: acc = (TP+TN)/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
* Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN
Contrary to frequentist approaches, in which one can only
say that if the test were repeated an infinite number of times,
and one constructed a confidence interval each time, then X%
of the confidence intervals would contain the true rate, here
we can say that given our observed data, there is a X% probability
that the true value of :math:`k/n` falls within the provided
interval.
.. note::
For a disambiguation with Confidence Interval, read
https://en.wikipedia.org/wiki/Credible_interval.
Parameters
==========
k : int
Number of successes observed on the experiment
l : int
Number of failures observed on the experiment
    lambda_ : float
        The parameterisation of the Beta prior to consider.  Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
        Jeffrey's prior.
    coverage : float
A floating-point number between 0 and 1.0 indicating the
coverage you're expecting. A value of 0.95 will ensure 95%
of the area under the probability density of the posterior
is covered by the returned equal-tailed interval.
Returns
=======
mean : float
The mean of the posterior distribution
mode : float
The mode of the posterior distribution
lower, upper: float
The lower and upper bounds of the credible region
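
    Examples
    ========
    An illustrative call with arbitrary counts, a flat prior and 95% coverage;
    only quantities with an obvious closed form are shown (the bounds come
    from the inverse regularised incomplete beta function):

    >>> mean, mode, lower, upper = beta_credible_region(5, 5, lambda_=1.0,
    ...                                                 coverage=0.95)
    >>> mean, mode
    (0.5, 0.5)
    >>> bool(0.0 < lower < mean < upper < 1.0)
    True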
"""
    # we return the equal-tailed range
    right = (1.0-coverage)/2  # probability mass excluded from each tail
lower = scipy.special.betaincinv(k+lambda_, l+lambda_, right)
upper = scipy.special.betaincinv(k+lambda_, l+lambda_, 1.0-right)
# evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
alpha = k+lambda_
beta = l+lambda_
E = alpha / (alpha + beta)
# the mode of a beta distribution is a bit tricky
if alpha > 1 and beta > 1:
mode = (alpha-1) / (alpha+beta-2)
elif alpha == 1 and beta == 1:
# In the case of precision, if the threshold is close to 1.0, both TP
# and FP can be zero, which may cause this condition to be reached, if
# the prior is exactly 1 (flat prior). This is a weird situation,
# because effectively we are trying to compute the posterior when the
# total number of experiments is zero. So, only the prior counts - but
        # the prior is flat, so we should just pick a value.  We choose
        # zero, consistently with the other degenerate cases below.
        mode = 0.0  # any value would do, we just pick this one
elif alpha <= 1 and beta > 1:
mode = 0.0
elif alpha > 1 and beta <= 1:
mode = 1.0
else: #elif alpha < 1 and beta < 1:
# in the case of precision, if the threshold is close to 1.0, both TP
# and FP can be zero, which may cause this condition to be reached, if
# the prior is smaller than 1. This is a weird situation, because
# effectively we are trying to compute the posterior when the total
# number of experiments is zero. So, only the prior counts - but the
# prior is bimodal, so we should just pick a value. We choose the
# left of the range.
mode = 0.0 #could also be 1.0 as the prior is bimodal
return E, mode, lower, upper
def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
"""Calculates mean and mode from true/false positive and negative counts
with credible regions
    This function returns Bayesian estimates of standard machine learning
    measures computed from the counts of true and false positives and negatives.
For a thorough look into these and alternate names for the returned values,
please check Wikipedia's entry on `Precision and Recall
<https://en.wikipedia.org/wiki/Precision_and_recall>`_. See
:py:func:`beta_credible_region` for details on the calculation of returned
values.
Parameters
----------
tp : int
True positive count, AKA "hit"
fp : int
False positive count, AKA "false alarm", or "Type I error"
tn : int
True negative count, AKA "correct rejection"
fn : int
False Negative count, AKA "miss", or "Type II error"
lambda_ : float
The parameterisation of the Beta prior to consider. Use
        :math:`\\lambda=1` for a flat prior.  Use :math:`\\lambda=0.5` for
Jeffrey's prior.
coverage : float
A floating-point number between 0 and 1.0 indicating the
coverage you're expecting. A value of 0.95 will ensure 95%
of the area under the probability density of the posterior
is covered by the returned equal-tailed interval.
Returns
-------
precision : (float, float, float, float)
P, AKA positive predictive value (PPV), mean, mode and credible
intervals (95% CI). It corresponds arithmetically
to ``tp/(tp+fp)``.
recall : (float, float, float, float)
R, AKA sensitivity, hit rate, or true positive rate (TPR), mean, mode
and credible intervals (95% CI). It corresponds arithmetically to
``tp/(tp+fn)``.
specificity : (float, float, float, float)
S, AKA selectivity or true negative rate (TNR), mean, mode and credible
intervals (95% CI). It corresponds arithmetically to ``tn/(tn+fp)``.
accuracy : (float, float, float, float)
        A, mean, mode and credible intervals (95% CI).  See `Accuracy
        <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
        It is the proportion of correct predictions (both true positives and
        true negatives) among the total number of pixels examined.  It
        corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  This measure
        includes both true negatives and true positives in the numerator,
        which makes it sensitive to data or regions without annotations.
jaccard : (float, float, float, float)
J, mean, mode and credible intervals (95% CI). See `Jaccard Index or
Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_. It
corresponds arithmetically to ``tp/(tp+fp+fn)``. The Jaccard index
depends on a TP-only numerator, similarly to the F1 score. For regions
where there are no annotations, the Jaccard index will always be zero,
irrespective of the model output. Accuracy may be a better proxy if
        one needs to consider the true absence of annotations in a region as
part of the measure.
f1_score : (float, float, float, float)
F1, mean, mode and credible intervals (95% CI). See `F1-score
<https://en.wikipedia.org/wiki/F1_score>`_. It corresponds
arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``. The F1 or
Dice score depends on a TP-only numerator, similarly to the Jaccard
index. For regions where there are no annotations, the F1-score will
always be zero, irrespective of the model output. Accuracy may be a
        better proxy if one needs to consider the true absence of annotations
in a region as part of the measure.
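
    Examples
    --------
    An illustrative call with arbitrary counts, a flat prior and a 95%
    credible region; each returned entry is a ``(mean, mode, lower, upper)``
    tuple:

    >>> retval = bayesian_measures(tp=10, fp=2, tn=80, fn=8, lambda_=1.0,
    ...                            coverage=0.95)
    >>> len(retval)
    6
    >>> precision = retval[0]
    >>> round(precision[1], 4)  # the mode equals tp/(tp+fp) under a flat prior
    0.8333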
"""
return (
beta_credible_region(tp, fp, lambda_, coverage), #precision
beta_credible_region(tp, fn, lambda_, coverage), #recall
beta_credible_region(tn, fp, lambda_, coverage), #specificity
beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy
beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index
beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score
)
def get_centered_maxf1(f1_scores, thresholds):
"""Return the centered max F1 score threshold when multiple threshold
give the same max F1 score
Parameters
----------
f1_scores : numpy.ndarray
1D array of f1 scores
thresholds : numpy.ndarray
1D array of thresholds
Returns
-------
    maxf1 : float
        The maximum F1 score
    threshold : float
        The threshold at which the maximum F1 score occurs (the center of the
        range, when several thresholds reach the maximum)
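
    Examples
    --------
    An illustrative case (arbitrary values) in which three consecutive
    thresholds reach the same maximum F1 score, so the middle one is chosen:

    >>> f1_scores = numpy.array([0.1, 0.5, 0.5, 0.5, 0.2])
    >>> thresholds = numpy.array([0.0, 0.25, 0.5, 0.75, 1.0])
    >>> maxf1, threshold = get_centered_maxf1(f1_scores, thresholds)
    >>> float(maxf1), float(threshold)
    (0.5, 0.5)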
"""
maxf1 = f1_scores.max()
maxf1_indices = numpy.where(f1_scores == maxf1)[0]
# If multiple thresholds give the same max F1 score
if len(maxf1_indices) > 1:
mean_maxf1_index = int(round(numpy.mean(maxf1_indices)))
else:
mean_maxf1_index = maxf1_indices[0]
return maxf1, thresholds[mean_maxf1_index]