@@ -35,6 +35,21 @@ if (!is.null(plot.filename)) {
   ggsave(plot.filename, p);
 }
 
+# computes the shared standard error, as used in the welch t-test
+welch.sd = function(old.rate, new.rate) {
+  old.se.squared = var(old.rate) / length(old.rate)
+  new.se.squared = var(new.rate) / length(new.rate)
+  return(sqrt(old.se.squared + new.se.squared))
+}
+
+# calculate the improvement confidence interval. The improvement is calculated
+# by dividing by old.mu and not new.mu, because old.mu is what the mean
+# improvement is calculated relative to.
+confidence.interval = function(shared.se, old.mu, w, risk) {
+  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
+  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
+}
+
 # Print a table with results
 statistics = ddply(dat, "name", function(subdat) {
   old.rate = subset(subdat, binary == "old")$rate;
@@ -45,33 +60,42 @@ statistics = ddply(dat, "name", function(subdat) {
   new.mu = mean(new.rate);
   improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));
 
-  p.value = NA;
-  confidence = 'NA';
+  r = list(
+    confidence = "NA",
+    improvement = improvement,
+    "accuracy (*)" = "NA",
+    "(**)" = "NA",
+    "(***)" = "NA"
+  );
+
   # Check if there is enough data to calculate the p-value
   if (length(old.rate) > 1 && length(new.rate) > 1) {
     # Perform a statistical test to see if there actually is a difference in
     # performance.
     w = t.test(rate ~ binary, data = subdat);
-    p.value = w$p.value;
+    shared.se = welch.sd(old.rate, new.rate)
 
     # Add user friendly stars to the table. There should be at least one star
     # before you can say that there is an improvement.
     confidence = '';
-    if (p.value < 0.001) {
+    if (w$p.value < 0.001) {
       confidence = '***';
-    } else if (p.value < 0.01) {
+    } else if (w$p.value < 0.01) {
       confidence = '**';
-    } else if (p.value < 0.05) {
+    } else if (w$p.value < 0.05) {
       confidence = '*';
     }
+
+    r = list(
+      confidence = confidence,
+      improvement = improvement,
+      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
+      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
+      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
+    );
   }
 
-  r = list(
-    improvement = improvement,
-    confidence = confidence,
-    p.value = p.value
-  );
-  return(data.frame(r));
+  return(data.frame(r, check.names = FALSE));
 });
 
 
@@ -81,3 +105,16 @@ statistics$name = NULL;
 
 options(width = 200);
 print(statistics);
+cat("\n")
+cat(sprintf(
+"Be aware that when doing many comparisons the risk of a false-positive
+result increases. In this case there are %d comparisons, so you can expect
+the following number of false-positive results:
+  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
+  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
+  %.2f false positives, when considering a 0.1%% risk acceptance (***)
+",
+  nrow(statistics),
+  nrow(statistics) * 0.05,
+  nrow(statistics) * 0.01,
+  nrow(statistics) * 0.001))
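
As a quick cross-check of the interval math added above, here is a minimal standalone sketch (not part of the commit; the two rate vectors are made up) showing that welch.sd() times the qt() quantile reproduces the half-width of the confidence interval that t.test() itself reports, since R's t.test() performs the Welch unequal-variance test by default:

# Hypothetical benchmark rates; any two numeric samples will do.
old.rate = c(102, 99, 101, 98, 100)
new.rate = c(110, 108, 113, 107, 111)

# Same helper as in the patch above.
welch.sd = function(old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}

w = t.test(old.rate, new.rate);  # Welch test: var.equal defaults to FALSE
shared.se = welch.sd(old.rate, new.rate)

# Half-width at a 5% risk level, as confidence.interval() computes it before
# scaling by old.mu.
half.width = qt(1 - (0.05 / 2), w$parameter) * shared.se

# t.test() reports the same interval directly, so the two half-widths agree.
stopifnot(abs(half.width - diff(w$conf.int) / 2) < 1e-9)

The expected-false-positive message printed at the end follows the same per-test logic: under the null hypothesis each comparison has a probability equal to the chosen risk level of earning a star by chance, so across nrow(statistics) comparisons roughly nrow(statistics) * 0.05 spurious single-star results are expected.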