1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from collections import deque
5import numpy
6import torch
7import scipy.special
8
9
10class SmoothedValue:
11 """Track a series of values and provide access to smoothed values over a
12 window or the global series average.
13 """
14
15 def __init__(self, window_size=20):
16 self.deque = deque(maxlen=window_size)
17
18 def update(self, value):
19 self.deque.append(value)
20
21 @property
22 def median(self):
23 d = torch.tensor(list(self.deque))
24 return d.median().item()
25
26 @property
27 def avg(self):
28 d = torch.tensor(list(self.deque))
29 return d.mean().item()
30
31
32def tricky_division(n, d):
33 """Divides n by d. Returns 0.0 in case of a division by zero"""
34
35 return n/(d+(d==0))
36
37
38def base_measures(tp, fp, tn, fn):
39 """Calculates measures from true/false positive and negative counts
40
41 This function can return standard machine learning measures from true and
42 false positive counts of positives and negatives. For a thorough look into
43 these and alternate names for the returned values, please check Wikipedia's
44 entry on `Precision and Recall
45 <https://en.wikipedia.org/wiki/Precision_and_recall>`_.
46
47
48 Parameters
49 ----------
50
51 tp : int
52 True positive count, AKA "hit"
53
54 fp : int
55 False positive count, AKA, "correct rejection"
56
57 tn : int
58 True negative count, AKA "false alarm", or "Type I error"
59
60 fn : int
61 False Negative count, AKA "miss", or "Type II error"
62
63
64 Returns
65 -------
66
67 precision : float
68 P, AKA positive predictive value (PPV). It corresponds arithmetically
69 to ``tp/(tp+fp)``. In the case ``tp+fp == 0``, this function returns
70 zero for precision.
71
72 recall : float
73 R, AKA sensitivity, hit rate, or true positive rate (TPR). It
74 corresponds arithmetically to ``tp/(tp+fn)``. In the special case
75 where ``tp+fn == 0``, this function returns zero for recall.
76
77 specificity : float
78 S, AKA selectivity or true negative rate (TNR). It
79 corresponds arithmetically to ``tn/(tn+fp)``. In the special case
80 where ``tn+fp == 0``, this function returns zero for specificity.
81
82 accuracy : float
83 A, see `Accuracy
84 <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_. is
85 the proportion of correct predictions (both true positives and true
86 negatives) among the total number of pixels examined. It corresponds
87 arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``. This measure includes
88 both true-negatives and positives in the numerator, what makes it
89 sensitive to data or regions without annotations.
90
91 jaccard : float
92 J, see `Jaccard Index or Similarity
93 <https://en.wikipedia.org/wiki/Jaccard_index>`_. It corresponds
94 arithmetically to ``tp/(tp+fp+fn)``. In the special case where
95 ``tn+fp+fn == 0``, this function returns zero for the Jaccard index.
96 The Jaccard index depends on a TP-only numerator, similarly to the F1
97 score. For regions where there are no annotations, the Jaccard index
98 will always be zero, irrespective of the model output. Accuracy may be
99 a better proxy if one needs to consider the true abscence of
100 annotations in a region as part of the measure.
101
102 f1_score : float
103 F1, see `F1-score <https://en.wikipedia.org/wiki/F1_score>`_. It
104 corresponds arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.
105 In the special case where ``P+R == (2*tp+fp+fn) == 0``, this function
106 returns zero for the Jaccard index. The F1 or Dice score depends on a
107 TP-only numerator, similarly to the Jaccard index. For regions where
108 there are no annotations, the F1-score will always be zero,
109 irrespective of the model output. Accuracy may be a better proxy if
110 one needs to consider the true abscence of annotations in a region as
111 part of the measure.
112
113 """
114
115 return (
116 tricky_division(tp, tp + fp), #precision
117 tricky_division(tp, tp + fn), #recall
118 tricky_division(tn, fp + tn), #specificity
119 tricky_division(tp + tn, tp + fp + fn + tn), #accuracy
120 tricky_division(tp, tp + fp + fn), #jaccard index
121 tricky_division(2*tp, (2*tp) + fp + fn), #f1-score
122 )
123
124def beta_credible_region(k, l, lambda_, coverage):
125 """
126 Returns the mode, upper and lower bounds of the equal-tailed credible
127 region of a probability estimate following Bernoulli trials.
128
129 This implemetnation is based on [GOUTTE-2005]_. It assumes :math:`k`
130 successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
131 from a series of Bernoulli trials (likelihood is binomial). The posterior
132 is derivated using the Bayes Theorem with a beta prior. As there is no
133 reason to favour high vs. low precision, we use a symmetric Beta prior
134 (:math:`\\alpha=\\beta`):
135
136 .. math::
137
138 P(p|k,n) &= \\frac{P(k,n|p)P(p)}{P(k,n)} \\\\
139 P(p|k,n) &= \\frac{\\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\\\
140 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\beta)}p^{k+\\alpha-1}(1-p)^{n-k+\\beta-1} \\\\
141 P(p|k,n) &= \\frac{1}{B(k+\\alpha, n-k+\\alpha)}p^{k+\\alpha-1}(1-p)^{n-k+\\alpha-1}
142
143 The mode for this posterior (also the maximum a posteriori) is:
144
145 .. math::
146
147 \\text{mode}(p) = \\frac{k+\\lambda-1}{n+2\\lambda-2}
148
149 Concretely, the prior may be flat (all rates are equally likely,
150 :math:`\\lambda=1`) or we may use Jeoffrey's prior
151 (:math:`\\lambda=0.5`), that is invariant through re-parameterisation.
152 Jeffrey's prior indicate that rates close to zero or one are more likely.
153
154 The mode above works if :math:`k+{\\alpha},n-k+{\\alpha} > 1`, which is
155 usually the case for a resonably well tunned system, with more than a few
156 samples for analysis. In the limit of the system performance, :math:`k`
157 may be 0, which will make the mode become zero.
158
159 For our purposes, it may be more suitable to represent :math:`n = k + l`,
160 with :math:`k`, the number of successes and :math:`l`, the number of
161 failures in the binomial experiment, and find this more suitable
162 representation:
163
164 .. math::
165
166 P(p|k,l) &= \\frac{1}{B(k+\\alpha, l+\\alpha)}p^{k+\\alpha-1}(1-p)^{l+\\alpha-1} \\\\
167 \\text{mode}(p) &= \\frac{k+\\lambda-1}{k+l+2\\lambda-2}
168
169 This can be mapped to most rates calculated in the context of binary
170 classification this way:
171
172 * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
173 * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
174 * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP
175 * F1-score: f1 = 2TP/(2TP+FP+FN), so k=2TP, l=FP+FN
176 * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
177 * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN
178
179 Contrary to frequentist approaches, in which one can only
180 say that if the test were repeated an infinite number of times,
181 and one constructed a confidence interval each time, then X%
182 of the confidence intervals would contain the true rate, here
183 we can say that given our observed data, there is a X% probability
184 that the true value of :math:`k/n` falls within the provided
185 interval.
186
187
188 .. note::
189
190 For a disambiguation with Confidence Interval, read
191 https://en.wikipedia.org/wiki/Credible_interval.
192
193
194 Parameters
195 ==========
196
197 k : int
198 Number of successes observed on the experiment
199
200 l : int
201 Number of failures observed on the experiment
202
203 lambda__ : :py:class:`float`, Optional
204 The parameterisation of the Beta prior to consider. Use
205 :math:`\\lambda=1` for a flat prior. Use :math:`\\lambda=0.5` for
206 Jeffrey's prior (the default).
207
208 coverage : :py:class:`float`, Optional
209 A floating-point number between 0 and 1.0 indicating the
210 coverage you're expecting. A value of 0.95 will ensure 95%
211 of the area under the probability density of the posterior
212 is covered by the returned equal-tailed interval.
213
214
215 Returns
216 =======
217
218 mean : float
219 The mean of the posterior distribution
220
221 mode : float
222 The mode of the posterior distribution
223
224 lower, upper: float
225 The lower and upper bounds of the credible region
226
227 """
228
229 # we return the equally-tailed range
230 right = (1.0-coverage)/2 #half-width in each side
231 lower = scipy.special.betaincinv(k+lambda_, l+lambda_, right)
232 upper = scipy.special.betaincinv(k+lambda_, l+lambda_, 1.0-right)
233
234 # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
235 alpha = k+lambda_
236 beta = l+lambda_
237
238 E = alpha / (alpha + beta)
239
240 # the mode of a beta distribution is a bit tricky
241 if alpha > 1 and beta > 1:
242 mode = (alpha-1) / (alpha+beta-2)
243 elif alpha == 1 and beta == 1:
244 # In the case of precision, if the threshold is close to 1.0, both TP
245 # and FP can be zero, which may cause this condition to be reached, if
246 # the prior is exactly 1 (flat prior). This is a weird situation,
247 # because effectively we are trying to compute the posterior when the
248 # total number of experiments is zero. So, only the prior counts - but
249 # the prior is flat, so we should just pick a value. We choose the
250 # middle of the range.
251 mode = 0.0 #any value would do, we just pick this one
252 elif alpha <= 1 and beta > 1:
253 mode = 0.0
254 elif alpha > 1 and beta <= 1:
255 mode = 1.0
256 else: #elif alpha < 1 and beta < 1:
257 # in the case of precision, if the threshold is close to 1.0, both TP
258 # and FP can be zero, which may cause this condition to be reached, if
259 # the prior is smaller than 1. This is a weird situation, because
260 # effectively we are trying to compute the posterior when the total
261 # number of experiments is zero. So, only the prior counts - but the
262 # prior is bimodal, so we should just pick a value. We choose the
263 # left of the range.
264 mode = 0.0 #could also be 1.0 as the prior is bimodal
265
266 return E, mode, lower, upper
267
268
269def bayesian_measures(tp, fp, tn, fn, lambda_, coverage):
270 """Calculates mean and mode from true/false positive and negative counts
271 with credible regions
272
273 This function can return bayesian estimates of standard machine learning
274 measures from true and false positive counts of positives and negatives.
275 For a thorough look into these and alternate names for the returned values,
276 please check Wikipedia's entry on `Precision and Recall
277 <https://en.wikipedia.org/wiki/Precision_and_recall>`_. See
278 :py:func:`beta_credible_region` for details on the calculation of returned
279 values.
280
281
282 Parameters
283 ----------
284
285 tp : int
286 True positive count, AKA "hit"
287
288 fp : int
289 False positive count, AKA "false alarm", or "Type I error"
290
291 tn : int
292 True negative count, AKA "correct rejection"
293
294 fn : int
295 False Negative count, AKA "miss", or "Type II error"
296
297 lambda_ : float
298 The parameterisation of the Beta prior to consider. Use
299 :math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
300 Jeffrey's prior.
301
302 coverage : float
303 A floating-point number between 0 and 1.0 indicating the
304 coverage you're expecting. A value of 0.95 will ensure 95%
305 of the area under the probability density of the posterior
306 is covered by the returned equal-tailed interval.
307
308
309
310 Returns
311 -------
312
313 precision : (float, float, float, float)
314 P, AKA positive predictive value (PPV), mean, mode and credible
315 intervals (95% CI). It corresponds arithmetically
316 to ``tp/(tp+fp)``.
317
318 recall : (float, float, float, float)
319 R, AKA sensitivity, hit rate, or true positive rate (TPR), mean, mode
320 and credible intervals (95% CI). It corresponds arithmetically to
321 ``tp/(tp+fn)``.
322
323 specificity : (float, float, float, float)
324 S, AKA selectivity or true negative rate (TNR), mean, mode and credible
325 intervals (95% CI). It corresponds arithmetically to ``tn/(tn+fp)``.
326
327 accuracy : (float, float, float, float)
328 A, mean, mode and credible intervals (95% CI). See `Accuracy
329 <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_. is
330 the proportion of correct predictions (both true positives and true
331 negatives) among the total number of pixels examined. It corresponds
332 arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``. This measure includes
333 both true-negatives and positives in the numerator, what makes it
334 sensitive to data or regions without annotations.
335
336 jaccard : (float, float, float, float)
337 J, mean, mode and credible intervals (95% CI). See `Jaccard Index or
338 Similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_. It
339 corresponds arithmetically to ``tp/(tp+fp+fn)``. The Jaccard index
340 depends on a TP-only numerator, similarly to the F1 score. For regions
341 where there are no annotations, the Jaccard index will always be zero,
342 irrespective of the model output. Accuracy may be a better proxy if
343 one needs to consider the true abscence of annotations in a region as
344 part of the measure.
345
346 f1_score : (float, float, float, float)
347 F1, mean, mode and credible intervals (95% CI). See `F1-score
348 <https://en.wikipedia.org/wiki/F1_score>`_. It corresponds
349 arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``. The F1 or
350 Dice score depends on a TP-only numerator, similarly to the Jaccard
351 index. For regions where there are no annotations, the F1-score will
352 always be zero, irrespective of the model output. Accuracy may be a
353 better proxy if one needs to consider the true abscence of annotations
354 in a region as part of the measure.
355 """
356
357 return (
358 beta_credible_region(tp, fp, lambda_, coverage), #precision
359 beta_credible_region(tp, fn, lambda_, coverage), #recall
360 beta_credible_region(tn, fp, lambda_, coverage), #specificity
361 beta_credible_region(tp+tn, fp+fn, lambda_, coverage), #accuracy
362 beta_credible_region(tp, fp+fn, lambda_, coverage), #jaccard index
363 beta_credible_region(2*tp, fp+fn, lambda_, coverage), #f1-score
364 )
365
366def get_centered_maxf1(f1_scores, thresholds):
367 """Return the centered max F1 score threshold when multiple threshold
368 give the same max F1 score
369
370
371 Parameters
372 ----------
373
374 f1_scores : numpy.ndarray
375 1D array of f1 scores
376
377 thresholds : numpy.ndarray
378 1D array of thresholds
379
380 Returns
381 -------
382
383 max F1 score: float
384
385 threshold: float
386
387 """
388 maxf1 = f1_scores.max()
389 maxf1_indices = numpy.where(f1_scores == maxf1)[0]
390
391 # If multiple thresholds give the same max F1 score
392 if len(maxf1_indices) > 1:
393 mean_maxf1_index = int(round(numpy.mean(maxf1_indices)))
394 else:
395 mean_maxf1_index = maxf1_indices[0]
396
397 return maxf1, thresholds[mean_maxf1_index]