Commit b5ec6ea

AndreasMadsen authored and evanlucas committed Jan 30, 2018
benchmark: make compare.R easier to understand
PR-URL: #18373
Reviewed-By: Joyee Cheung
Reviewed-By: James M Snell
Reviewed-By: Ruben Bridgewater
1 parent 7b73e70 commit b5ec6ea

1 file changed: +49 -12 lines
 

benchmark/compare.R

@@ -35,6 +35,21 @@ if (!is.null(plot.filename)) {
   ggsave(plot.filename, p);
 }
 
+# computes the shared standard error, as used in the welch t-test
+welch.sd = function (old.rate, new.rate) {
+  old.se.squared = var(old.rate) / length(old.rate)
+  new.se.squared = var(new.rate) / length(new.rate)
+  return(sqrt(old.se.squared + new.se.squared))
+}
+
+# calculate the improvement confidence interval. The improvement is calculated
+# by dividing by old.mu and not new.mu, because old.mu is what the mean
+# improvement is calculated relative to.
+confidence.interval = function (shared.se, old.mu, w, risk) {
+  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
+  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
+}
+
 # Print a table with results
 statistics = ddply(dat, "name", function(subdat) {
   old.rate = subset(subdat, binary == "old")$rate;
@@ -45,33 +60,42 @@ statistics = ddply(dat, "name", function(subdat) {
   new.mu = mean(new.rate);
   improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));
 
-  p.value = NA;
-  confidence = 'NA';
+  r = list(
+    confidence = "NA",
+    improvement = improvement,
+    "accuracy (*)" = "NA",
+    "(**)" = "NA",
+    "(***)" = "NA"
+  );
+
   # Check if there is enough data to calculate the calculate the p-value
   if (length(old.rate) > 1 && length(new.rate) > 1) {
     # Perform a statistics test to see of there actually is a difference in
     # performance.
     w = t.test(rate ~ binary, data=subdat);
-    p.value = w$p.value;
+    shared.se = welch.sd(old.rate, new.rate)
 
     # Add user friendly stars to the table. There should be at least one star
     # before you can say that there is an improvement.
     confidence = '';
-    if (p.value < 0.001) {
+    if (w$p.value < 0.001) {
       confidence = '***';
-    } else if (p.value < 0.01) {
+    } else if (w$p.value < 0.01) {
       confidence = '**';
-    } else if (p.value < 0.05) {
+    } else if (w$p.value < 0.05) {
       confidence = '*';
     }
+
+    r = list(
+      confidence = confidence,
+      improvement = improvement,
+      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
+      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
+      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
+    );
   }
 
-  r = list(
-    improvement = improvement,
-    confidence = confidence,
-    p.value = p.value
-  );
-  return(data.frame(r));
+  return(data.frame(r, check.names=FALSE));
 });
 
 
@@ -81,3 +105,16 @@ statistics$name = NULL;
 
 options(width = 200);
 print(statistics);
+cat("\n")
+cat(sprintf(
+"Be aware that when doing many comparisions the risk of a false-positive
+result increases. In this case there are %d comparisions, you can thus
+expect the following amount of false-positive results:
+  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
+  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
+  %.2f false positives, when considering a 0.1%% risk acceptance (***)
+",
+nrow(statistics),
+nrow(statistics) * 0.05,
+nrow(statistics) * 0.01,
+nrow(statistics) * 0.001))
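
For context on the statistics the patch adds: the new "accuracy" columns report a confidence interval on the improvement, built from the shared standard error of the two sample means (welch.sd) scaled by a Student-t quantile at the chosen risk level and expressed as a percentage of the old mean (confidence.interval). The standalone R sketch below mirrors that calculation outside the script; the rate vectors and the 5% risk level are invented for illustration and are not part of the patch, while welch.sd's formula, t.test and qt follow the code above and base R.

  # Hypothetical benchmark rates (ops/sec); real values come from the CSV
  # input that compare.R normally reads.
  old.rate = c(1000, 1020, 990, 1010, 1005)
  new.rate = c(1080, 1100, 1060, 1090, 1075)

  # Shared standard error, as in welch.sd():
  # sqrt(var(old)/n_old + var(new)/n_new)
  shared.se = sqrt(var(old.rate) / length(old.rate) +
                   var(new.rate) / length(new.rate))

  # Welch t-test; w$parameter holds the Welch-adjusted degrees of freedom
  # that confidence.interval() passes to qt().
  rate = c(old.rate, new.rate)
  binary = factor(rep(c("old", "new"), each = length(old.rate)))
  w = t.test(rate ~ binary)

  # Interval at a 5% risk level, relative to the old mean, mirroring
  # confidence.interval(shared.se, old.mu, w, 0.05).
  old.mu = mean(old.rate)
  interval = qt(1 - (0.05 / 2), w$parameter) * shared.se
  cat(sprintf("improvement: %.2f %% ±%.2f%%\n",
              (mean(new.rate) - old.mu) / old.mu * 100,
              (interval / old.mu) * 100))

The closing cat() in the patch is plain multiple-comparison arithmetic: with n benchmarks compared, the expected number of falsely starred rows at risk level α is about n × α, so for example 40 comparisons at the 5% level are expected to yield roughly 2 false positives even when nothing changed.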
