-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbootstrap.js
More file actions
101 lines (89 loc) · 3.35 KB
/
bootstrap.js
File metadata and controls
101 lines (89 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
* bootstrap.js
* Bootstrap resampling for applicability score confidence intervals
* and pairwise model comparison, as described in the paper.
*
* All inference is performed at the QUERY level to respect the
* natural independence structure of retrieval evaluation (§2).
*/
/**
 * Produce the mean of a single bootstrap replicate.
 *
 * Draws `scores.length` values from `scores` uniformly at random with
 * replacement (one `Math.random()` call per draw) and averages them.
 * An empty array yields `NaN` (0 / 0).
 *
 * @param {Array<number>} scores  Per-query metric scores
 * @returns {number}              Mean of one bootstrap replicate
 */
function bootstrapReplicateMean(scores) {
  const size = scores.length;
  // One resampled value per original observation, drawn in order.
  const resampled = Array.from(
    { length: size },
    () => scores[Math.floor(Math.random() * size)],
  );
  const total = resampled.reduce((acc, value) => acc + value, 0);
  return total / size;
}
/**
 * Compute a percentile bootstrap confidence interval for a single model's
 * applicability score S(f, T) — Equations (1)–(2) and (6)–(7).
 *
 * The point estimate is the plain mean of the per-query scores. The
 * interval bounds are read from the sorted bootstrap replicate means at
 * the alpha/2 and 1 − alpha/2 quantile positions. Positions are clamped
 * to the valid index range, so a degenerate `alpha` (e.g. 0, which would
 * otherwise index one past the end) returns the extreme replicate
 * instead of `undefined`.
 *
 * @param {Array<number>} perQueryScores  Per-query metric scores m(qi; f)
 * @param {number}        B               Number of bootstrap replicates
 * @param {number}        alpha           Significance level (default 0.05 → 95% CI)
 * @returns {{ mean: number, lo: number, hi: number, replicates: number[] }}
 *   `replicates` is sorted ascending. `lo`/`hi` are `undefined` only when
 *   no replicates were drawn (B < 1).
 */
function bootstrapCI(perQueryScores, B = 1000, alpha = 0.05) {
  const n = perQueryScores.length;
  const mean = perQueryScores.reduce((s, v) => s + v, 0) / n;

  const replicates = [];
  for (let b = 0; b < B; b++) {
    replicates.push(bootstrapReplicateMean(perQueryScores));
  }
  // Numeric comparator: the default sort would order lexicographically.
  replicates.sort((x, y) => x - y);

  // Clamp so that q = 1 (alpha = 0) maps to the last replicate rather
  // than to index B, which is out of bounds.
  const lastIndex = replicates.length - 1;
  const quantile = (q) =>
    replicates[Math.min(lastIndex, Math.max(0, Math.floor(q * B)))];

  const lo = quantile(alpha / 2);
  const hi = quantile(1 - alpha / 2);
  return { mean, lo, hi, replicates };
}
/**
 * Paired bootstrap comparison between two models — Equations (3)–(8).
 *
 * Both models are evaluated on the same queries so we bootstrap the
 * per-query *differences* directly. This controls for query difficulty
 * and yields tighter intervals than an unpaired approach (Eq. 8).
 *
 * The interval is a percentile interval over the sorted replicate means
 * of the differences. Quantile positions are clamped to the valid index
 * range so a degenerate `alpha` (e.g. 0) yields the extreme replicate
 * rather than `undefined`. The winner is derived from the same interval
 * that decides significance, so the two fields can never disagree.
 *
 * @param {Array<number>} scoresA  Per-query scores for model A
 * @param {Array<number>} scoresB  Per-query scores for model B
 * @param {number}        B        Number of bootstrap replicates
 * @param {number}        alpha    Significance level (default 0.05)
 * @returns {{
 *   meanDiff:    number,
 *   lo:          number,
 *   hi:          number,
 *   significant: boolean,
 *   winner:      'A' | 'B' | 'tie',
 *   replicates:  number[]
 * }}
 * @throws {Error} If the two score arrays differ in length.
 */
function pairedBootstrapComparison(scoresA, scoresB, B = 1000, alpha = 0.05) {
  if (scoresA.length !== scoresB.length) {
    throw new Error('Score arrays must be the same length (same query set).');
  }

  // Per-query differences Δ(qi) = ma(qi) − mb(qi)  [Eq. 3]
  const diffs = scoresA.map((a, i) => a - scoresB[i]);
  const n = diffs.length;

  // Estimated mean difference  [Eq. 5]
  const meanDiff = diffs.reduce((s, v) => s + v, 0) / n;

  // Bootstrap replicates of Δ̂*  [Eq. 6]
  const replicates = [];
  for (let b = 0; b < B; b++) {
    let sum = 0;
    for (let i = 0; i < n; i++) {
      sum += diffs[Math.floor(Math.random() * n)];
    }
    replicates.push(sum / n);
  }
  replicates.sort((x, y) => x - y);

  // (1 − alpha) percentile CI  [Eq. 7]; indices clamped to stay in bounds.
  const lastIndex = replicates.length - 1;
  const quantile = (q) =>
    replicates[Math.min(lastIndex, Math.max(0, Math.floor(q * B)))];
  const lo = quantile(alpha / 2);
  const hi = quantile(1 - alpha / 2);

  // CI excludes zero → one model is significantly better. The side of
  // zero the interval falls on names the winner (A when Δ > 0).
  const aWins = lo > 0;
  const bWins = hi < 0;
  const significant = aWins || bWins;

  let winner = 'tie';
  if (aWins) {
    winner = 'A';
  } else if (bWins) {
    winner = 'B';
  }

  return { meanDiff, lo, hi, significant, winner, replicates };
}