github.com/jgbaldwinbrown/perf@v0.1.1/pkg/stats/udist.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package stats 6 7 import ( 8 "math" 9 ) 10 11 // A UDist is the discrete probability distribution of the 12 // Mann-Whitney U statistic for a pair of samples of sizes N1 and N2. 13 // 14 // The details of computing this distribution with no ties can be 15 // found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of 16 // Whether one of Two Random Variables is Stochastically Larger than 17 // the Other". Annals of Mathematical Statistics 18 (1): 50–60. 18 // Computing this distribution in the presence of ties is described in 19 // Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". 20 // Journal of the American Statistical Association 61 (315): 772-787 21 // and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney 22 // Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: 23 // 805-813 (the former paper contains details that are glossed over in 24 // the latter paper but has mathematical typesetting issues, so it's 25 // easiest to get the context from the former paper and the details 26 // from the latter). 27 type UDist struct { 28 N1, N2 int 29 30 // T is the count of the number of ties at each rank in the 31 // input distributions. T may be nil, in which case it is 32 // assumed there are no ties (which is equivalent to an M+N 33 // slice of 1s). It must be the case that Sum(T) == M+N. 34 T []int 35 } 36 37 // hasTies returns true if d has any tied samples. 38 func (d UDist) hasTies() bool { 39 for _, t := range d.T { 40 if t > 1 { 41 return true 42 } 43 } 44 return false 45 } 46 47 // p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947 48 // for values of U from 0 up to and including the U argument. 49 // 50 // This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite 51 // fast for small values of N1 and N2. However, it does not handle ties. 52 func (d UDist) p(U int) []float64 { 53 // This is a dynamic programming implementation of the 54 // recursive recurrence definition given by Mann and Whitney: 55 // 56 // p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m) 57 // p_{n,m}(U) = 0 if U < 0 58 // p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0 59 // = 0 if U > 0 60 // 61 // (Note that there is a typo in the original paper. The first 62 // recursive application of p should be for U-m, not U-M.) 63 // 64 // Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only 65 // need to store one "plane" of the three dimensional space at 66 // a time. 67 // 68 // Furthermore, p_{n,m} = p_{m,n}, so we only construct values 69 // for n <= m and obtain the rest through symmetry. 70 // 71 // We organize the computed values of p as followed: 72 // 73 // n → N 74 // m * 75 // ↓ * * 76 // * * * 77 // * * * * 78 // * * * * 79 // M * * * * 80 // 81 // where each * is a slice indexed by U. The code below 82 // computes these left-to-right, top-to-bottom, so it only 83 // stores one row of this matrix at a time. Furthermore, 84 // computing an element in a given U slice only depends on the 85 // same and smaller values of U, so we can overwrite the U 86 // slice we're computing in place as long as we start with the 87 // largest value of U. Finally, even though the recurrence 88 // depends on (n,m) above the diagonal and we use symmetry to 89 // mirror those across the diagonal to (m,n), the mirrored 90 // indexes are always available in the current row, so this 91 // mirroring does not interfere with our ability to recycle 92 // state. 93 94 N, M := d.N1, d.N2 95 if N > M { 96 N, M = M, N 97 } 98 99 memo := make([][]float64, N+1) 100 for n := range memo { 101 memo[n] = make([]float64, U+1) 102 } 103 104 for m := 0; m <= M; m++ { 105 // Compute p_{0,m}. This is zero except for U=0. 106 memo[0][0] = 1 107 108 // Compute the remainder of this row. 109 nlim := N 110 if m < nlim { 111 nlim = m 112 } 113 for n := 1; n <= nlim; n++ { 114 lp := memo[n-1] // p_{n-1,m} 115 var rp []float64 116 if n <= m-1 { 117 rp = memo[n] // p_{n,m-1} 118 } else { 119 rp = memo[m-1] // p{m-1,n} and m==n 120 } 121 122 // For a given n,m, U is at most n*m. 123 // 124 // TODO: Actually, it's at most ⌈n*m/2⌉, but 125 // then we need to use more complex symmetries 126 // in the inner loop below. 127 ulim := n * m 128 if U < ulim { 129 ulim = U 130 } 131 132 out := memo[n] // p_{n,m} 133 nplusm := float64(n + m) 134 for U1 := ulim; U1 >= 0; U1-- { 135 l := 0.0 136 if U1-m >= 0 { 137 l = float64(n) * lp[U1-m] 138 } 139 r := float64(m) * rp[U1] 140 out[U1] = (l + r) / nplusm 141 } 142 } 143 } 144 return memo[N] 145 } 146 147 type ukey struct { 148 n1 int // size of first sample 149 twoU int // 2*U statistic for this permutation 150 } 151 152 // This computes the cumulative counts of the Mann-Whitney U 153 // distribution in the presence of ties using the computation from 154 // Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney 155 // Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: 156 // 805-813, with much guidance from appendix L of Klotz, A 157 // Computational Approach to Statistics. 158 // 159 // makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the 160 // number of ranks (up to len(t)), n1 is the size of the first sample 161 // (up to the n1 argument), and U is the U statistic (up to the 162 // argument twoU/2). The value of an entry in the memo table is the 163 // number of permutations of a sample of size n1 in a ranking with tie 164 // vector t[:K] having a U statistic <= U. 165 func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 { 166 // Another candidate for a fast implementation is van de Wiel, 167 // "The split-up algorithm: a fast symbolic method for 168 // computing p-values of distribution-free statistics". This 169 // is what's used by R's coin package. It's a comparatively 170 // recent publication, so it's presumably faster (or perhaps 171 // just more general) than previous techniques, but I can't 172 // get my hands on the paper. 173 // 174 // TODO: ~40% of this function's time is spent in mapassign on 175 // the assignment lines in the two loops and another ~20% in 176 // map access and iteration. Improving map behavior or 177 // replacing the maps altogether with some other constant-time 178 // structure could double performance. 179 // 180 // TODO: The worst case for this function is when there are 181 // few ties. Yet the best case overall is when there are *no* 182 // ties. Can we get the best of both worlds? Use the fast 183 // algorithm for the most part when there are few ties and mix 184 // in the general algorithm just where we need it? That's 185 // certainly possible for sub-problems where t[:k] has no 186 // ties, but that doesn't help if t[0] has a tie but nothing 187 // else does. Is it possible to rearrange the ranks without 188 // messing up our computation of the U statistic for 189 // sub-problems? 190 191 K := len(t) 192 193 // Compute a coefficients. The a slice is indexed by k (a[0] 194 // is unused). 195 a := make([]int, K+1) 196 a[1] = t[0] 197 for k := 2; k <= K; k++ { 198 a[k] = a[k-1] + t[k-2] + t[k-1] 199 } 200 201 // Create the memo table for the counts function, A. The A 202 // slice is indexed by k (A[0] is unused). 203 // 204 // In "The Mann Whitney Distribution Using Linked Lists", they 205 // use linked lists (*gasp*) for this, but within each K it's 206 // really just a memoization table, so it's faster to use a 207 // map. The outer structure is a slice indexed by k because we 208 // need to find all memo entries with certain values of k. 209 // 210 // TODO: The n1 and twoU values in the ukeys follow strict 211 // patterns. For each K value, the n1 values are every integer 212 // between two bounds. For each (K, n1) value, the twoU values 213 // are every integer multiple of a certain base between two 214 // bounds. It might be worth turning these into directly 215 // indexible slices. 216 A := make([]map[ukey]float64, K+1) 217 A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0} 218 219 // Compute memo table (k, n1, twoU) triples from high K values 220 // to low K values. This drives the recurrence relation 221 // downward to figure out all of the needed argument triples. 222 // 223 // TODO: Is it possible to generate this table bottom-up? If 224 // so, this could be a pure dynamic programming algorithm and 225 // we could discard the K dimension. We could at least store 226 // the inputs in a more compact representation that replaces 227 // the twoU dimension with an interval and a step size (as 228 // suggested by Cheung, Klotz, not that they make it at all 229 // clear *why* they're suggesting this). 230 tsum := sumint(t) // always ∑ t[0:k] 231 for k := K - 1; k >= 2; k-- { 232 tsum -= t[k] 233 A[k] = make(map[ukey]float64) 234 235 // Construct A[k] from A[k+1]. 236 for A_kplus1 := range A[k+1] { 237 rkLow := maxint(0, A_kplus1.n1-tsum) 238 rkHigh := minint(A_kplus1.n1, t[k]) 239 for rk := rkLow; rk <= rkHigh; rk++ { 240 twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) 241 n1_k := A_kplus1.n1 - rk 242 if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { 243 key := ukey{n1: n1_k, twoU: twoU_k} 244 A[k][key] = 0 245 } 246 } 247 } 248 } 249 250 // Fill counts in memo table from low K values to high K 251 // values. This unwinds the recurrence relation. 252 253 // Start with K==2 base case. 254 // 255 // TODO: Later computations depend on these, but these don't 256 // depend on anything (including each other), so if K==2, we 257 // can skip the memo table altogether. 258 if K < 2 { 259 panic("K < 2") 260 } 261 N_2 := t[0] + t[1] 262 for A_2i := range A[2] { 263 Asum := 0.0 264 r2Low := maxint(0, A_2i.n1-t[0]) 265 r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 266 for r2 := r2Low; r2 <= r2High; r2++ { 267 Asum += mathChoose(t[0], A_2i.n1-r2) * 268 mathChoose(t[1], r2) 269 } 270 A[2][A_2i] = Asum 271 } 272 273 // Derive counts for the rest of the memo table. 274 tsum = t[0] // always ∑ t[0:k-1] 275 for k := 3; k <= K; k++ { 276 tsum += t[k-2] 277 278 // Compute A[k] counts from A[k-1] counts. 279 for A_ki := range A[k] { 280 Asum := 0.0 281 rkLow := maxint(0, A_ki.n1-tsum) 282 rkHigh := minint(A_ki.n1, t[k-1]) 283 for rk := rkLow; rk <= rkHigh; rk++ { 284 twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) 285 n1_kminus1 := A_ki.n1 - rk 286 x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] 287 if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { 288 x = mathChoose(tsum, n1_kminus1) 289 } 290 Asum += x * mathChoose(t[k-1], rk) 291 } 292 A[k][A_ki] = Asum 293 } 294 } 295 296 return A 297 } 298 299 func twoUmin(n1 int, t, a []int) int { 300 K := len(t) 301 twoU := -n1 * n1 302 n1_k := n1 303 for k := 1; k <= K; k++ { 304 twoU_k := minint(n1_k, t[k-1]) 305 twoU += twoU_k * a[k] 306 n1_k -= twoU_k 307 } 308 return twoU 309 } 310 311 func twoUmax(n1 int, t, a []int) int { 312 K := len(t) 313 twoU := -n1 * n1 314 n1_k := n1 315 for k := K; k > 0; k-- { 316 twoU_k := minint(n1_k, t[k-1]) 317 twoU += twoU_k * a[k] 318 n1_k -= twoU_k 319 } 320 return twoU 321 } 322 323 func (d UDist) PMF(U float64) float64 { 324 if U < 0 || U >= 0.5+float64(d.N1*d.N2) { 325 return 0 326 } 327 328 if d.hasTies() { 329 // makeUmemo computes the CDF directly. Take its 330 // difference to get the PMF. 331 p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] 332 p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] 333 if !ok1 || !ok2 { 334 panic("makeUmemo did not return expected memoization table") 335 } 336 return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1) 337 } 338 339 // There are no ties. Use the fast algorithm. U must be integral. 340 Ui := int(math.Floor(U)) 341 // TODO: Use symmetry to minimize U 342 return d.p(Ui)[Ui] 343 } 344 345 func (d UDist) CDF(U float64) float64 { 346 if U < 0 { 347 return 0 348 } else if U >= float64(d.N1*d.N2) { 349 return 1 350 } 351 352 if d.hasTies() { 353 // TODO: Minimize U? 354 p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] 355 if !ok { 356 panic("makeUmemo did not return expected memoization table") 357 } 358 return p / mathChoose(d.N1+d.N2, d.N1) 359 } 360 361 // There are no ties. Use the fast algorithm. U must be integral. 362 Ui := int(math.Floor(U)) 363 // The distribution is symmetric around U = m * n / 2. Sum up 364 // whichever tail is smaller. 365 flip := Ui >= (d.N1*d.N2+1)/2 366 if flip { 367 Ui = d.N1*d.N2 - Ui - 1 368 } 369 pdfs := d.p(Ui) 370 p := 0.0 371 for _, pdf := range pdfs[:Ui+1] { 372 p += pdf 373 } 374 if flip { 375 p = 1 - p 376 } 377 return p 378 } 379 380 func (d UDist) Step() float64 { 381 return 0.5 382 } 383 384 func (d UDist) Bounds() (float64, float64) { 385 // TODO: More precise bounds when there are ties. 386 return 0, float64(d.N1 * d.N2) 387 }