import numpy as np
import sklearn.metrics as sk

recall_level_default = 0.95


def calib_err(confidence, correct, p='2', beta=100):
    # binned calibration error; beta is the target bin size
    idxs = np.argsort(confidence)
    confidence = confidence[idxs]
    correct = correct[idxs]
    bins = [[i * beta, (i + 1) * beta] for i in range(len(confidence) // beta)]
    bins[-1] = [bins[-1][0], len(confidence)]

    cerr = 0
    total_examples = len(confidence)
    for i in range(len(bins) - 1):
        bin_confidence = confidence[bins[i][0]:bins[i][1]]
        bin_correct = correct[bins[i][0]:bins[i][1]]
        num_examples_in_bin = len(bin_confidence)

        if num_examples_in_bin > 0:
            difference = np.abs(np.nanmean(bin_confidence) - np.nanmean(bin_correct))

            if p == '2':
                cerr += num_examples_in_bin / total_examples * np.square(difference)
            elif p == '1':
                cerr += num_examples_in_bin / total_examples * difference
            elif p == 'infty' or p == 'infinity' or p == 'max':
                cerr = np.maximum(cerr, difference)
            else:
                assert False, "p must be '1', '2', or 'infty'"

    if p == '2':
        cerr = np.sqrt(cerr)

    return cerr


def aurra(confidence, correct):
    # area under the response-rate accuracy curve
    conf_ranks = np.argsort(confidence)[::-1]  # indices from greatest to least confidence
    rra_curve = np.cumsum(np.asarray(correct)[conf_ranks])
    rra_curve = rra_curve / np.arange(1, len(rra_curve) + 1)  # accuracy at each response rate
    return np.mean(rra_curve)


def soft_f1(confidence, correct):
    wrong = 1 - correct
    # # the incorrectly classified samples are our interest
    # # so they make the positive class
    # tp_soft = np.sum((1 - confidence) * wrong)
    # fp_soft = np.sum((1 - confidence) * correct)
    # fn_soft = np.sum(confidence * wrong)
    # return 2 * tp_soft / (2 * tp_soft + fn_soft + fp_soft)
    return 2 * ((1 - confidence) * wrong).sum() / (1 - confidence + wrong).sum()


def tune_temp(logits, labels, binary_search=True, lower=0.2, upper=5.0, eps=0.0001):
    # find a softmax temperature that minimizes the cross-entropy on held-out logits
    logits = np.array(logits)

    if binary_search:
        import torch
        import torch.nn.functional as F

        logits = torch.FloatTensor(logits)
        labels = torch.LongTensor(labels)
        t_guess = torch.FloatTensor([0.5 * (lower + upper)]).requires_grad_()

        while upper - lower > eps:
            if torch.autograd.grad(F.cross_entropy(logits / t_guess, labels), t_guess)[0] > 0:
                upper = 0.5 * (lower + upper)
            else:
                lower = 0.5 * (lower + upper)
            t_guess = t_guess * 0 + 0.5 * (lower + upper)

        t = min([lower, 0.5 * (lower + upper), upper],
                key=lambda x: float(F.cross_entropy(logits / x, labels)))
    else:
        import cvxpy as cx

        set_size = np.array(logits).shape[0]

        t = cx.Variable()

        expr = sum((cx.Minimize(cx.log_sum_exp(logits[i, :] * t) - logits[i, labels[i]] * t)
                    for i in range(set_size)))
        p = cx.Problem(expr, [lower <= t, t <= upper])

        p.solve()  # p.solve(solver=cx.SCS)
        t = 1 / t.value

    return t


def print_measures(rms, aurra_metric, mad, sf1, method_name='Baseline'):
    print('\t\t\t\t\t\t\t' + method_name)
    print('RMS Calib Error (%): \t\t{:.2f}'.format(100 * rms))
    print('AURRA (%): \t\t\t{:.2f}'.format(100 * aurra_metric))
    # print('MAD Calib Error (%): \t\t{:.2f}'.format(100 * mad))
    # print('Soft F1 Score (%): \t\t{:.2f}'.format(100 * sf1))


def show_calibration_results(confidence, correct, method_name='Baseline'):
    print('\t\t\t\t' + method_name)
    print('RMS Calib Error (%): \t\t{:.2f}'.format(
        100 * calib_err(confidence, correct, p='2')))
    print('AURRA (%): \t\t\t{:.2f}'.format(
        100 * aurra(confidence, correct)))
    # print('MAD Calib Error (%): \t\t{:.2f}'.format(
    #     100 * calib_err(confidence, correct, p='1')))
    # print('Soft F1-Score (%): \t\t{:.2f}'.format(
    #     100 * soft_f1(confidence, correct)))
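
# NOTE: fpr_and_fdr_at_recall below calls a stable_cumsum helper that is neither
# defined nor imported in this file. The following is a minimal sketch, assuming
# the intended behavior matches sklearn.utils.extmath.stable_cumsum: a float64
# cumulative sum whose final element is checked against the direct sum.
def stable_cumsum(arr, rtol=1e-05, atol=1e-08):
    out = np.cumsum(arr, dtype=np.float64)
    expected = np.sum(arr, dtype=np.float64)
    if not np.allclose(out[-1], expected, rtol=rtol, atol=atol):
        raise RuntimeError('cumsum was found to be unstable: '
                           'its last element does not correspond to sum')
    return out
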
def fpr_and_fdr_at_recall(y_true, y_score, recall_level=recall_level_default, pos_label=None):
    classes = np.unique(y_true)
    if (pos_label is None and
            not (np.array_equal(classes, [0, 1]) or
                 np.array_equal(classes, [-1, 1]) or
                 np.array_equal(classes, [0]) or
                 np.array_equal(classes, [-1]) or
                 np.array_equal(classes, [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true)[threshold_idxs]
    fps = 1 + threshold_idxs - tps  # add one because of zero-based indexing

    thresholds = y_score[threshold_idxs]

    recall = tps / tps[-1]

    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)  # [last_ind::-1]
    recall, fps, tps, thresholds = np.r_[recall[sl], 1], np.r_[fps[sl], 0], np.r_[tps[sl], 0], thresholds[sl]

    cutoff = np.argmin(np.abs(recall - recall_level))

    return fps[cutoff] / (np.sum(np.logical_not(y_true)))  # , fps[cutoff]/(fps[cutoff] + tps[cutoff])


def get_measures(_pos, _neg, recall_level=recall_level_default):
    pos = np.array(_pos[:]).reshape((-1, 1))
    neg = np.array(_neg[:]).reshape((-1, 1))
    examples = np.squeeze(np.vstack((pos, neg)))
    labels = np.zeros(len(examples), dtype=np.int32)
    labels[:len(pos)] += 1

    auroc = sk.roc_auc_score(labels, examples)
    aupr = sk.average_precision_score(labels, examples)
    fpr = fpr_and_fdr_at_recall(labels, examples, recall_level)

    return auroc, aupr, fpr


def print_measures_old(auroc, aupr, fpr, method_name='Ours', recall_level=recall_level_default):
    print('\t\t\t' + method_name)
    print('FPR{:d}:\t{:.2f}'.format(int(100 * recall_level), 100 * fpr))
    print('AUROC: \t{:.2f}'.format(100 * auroc))
    print('AUPR: \t{:.2f}'.format(100 * aupr))


def print_measures_with_std(aurocs, auprs, fprs, method_name='Ours', recall_level=recall_level_default):
    print('\t\t\t' + method_name)
    print('FPR{:d}:\t{:.2f}\t+/- {:.2f}'.format(int(100 * recall_level), 100 * np.mean(fprs), 100 * np.std(fprs)))
    print('AUROC: \t{:.2f}\t+/- {:.2f}'.format(100 * np.mean(aurocs), 100 * np.std(aurocs)))
    print('AUPR: \t{:.2f}\t+/- {:.2f}'.format(100 * np.mean(auprs), 100 * np.std(auprs)))


def get_and_print_results(out_score, in_score, num_to_avg=1):
    aurocs, auprs, fprs = [], [], []
    # for _ in range(num_to_avg):
    #     out_score = get_ood_scores(ood_loader)
    measures = get_measures(out_score, in_score)
    aurocs.append(measures[0]); auprs.append(measures[1]); fprs.append(measures[2])

    auroc = np.mean(aurocs); aupr = np.mean(auprs); fpr = np.mean(fprs)
    # auroc_list.append(auroc); aupr_list.append(aupr); fpr_list.append(fpr)

    # if num_to_avg >= 5:
    #     print_measures_with_std(aurocs, auprs, fprs, method_name='Ours')
    # else:
    #     print_measures(auroc, aupr, fpr, method_name='Ours')

    return auroc, aupr, fpr
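

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original utilities): the synthetic arrays below
# are invented purely for illustration. The calibration metrics expect
# per-example confidences in [0, 1] and 0/1 correctness indicators; the OOD
# metrics expect scores where higher values are assigned to out-of-distribution
# inputs, since get_measures treats its first argument as the positive class.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.default_rng(0)

    # calibration metrics on a roughly calibrated synthetic classifier
    confidence = rng.uniform(0.5, 1.0, size=1000)
    correct = (rng.uniform(size=1000) < confidence).astype(np.float64)
    show_calibration_results(confidence, correct, method_name='Synthetic baseline')

    # OOD detection metrics; printing is done explicitly here because the print
    # calls inside get_and_print_results are commented out above
    out_score = rng.normal(loc=1.0, scale=1.0, size=1000)
    in_score = rng.normal(loc=0.0, scale=1.0, size=1000)
    auroc, aupr, fpr = get_and_print_results(out_score, in_score)
    print_measures_old(auroc, aupr, fpr, method_name='Synthetic OOD scores')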