github.com/jgbaldwinbrown/perf@v0.1.1/pkg/stats/udist.go

github.com/jgbaldwinbrown/perf@v0.1.1/pkg/stats/udist.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package stats
     6  
     7  import (
     8  	"math"
     9  )
    10  
    11  // A UDist is the discrete probability distribution of the
    12  // Mann-Whitney U statistic for a pair of samples of sizes N1 and N2.
    13  //
    14  // The details of computing this distribution with no ties can be
    15  // found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
    16  // Whether one of Two Random Variables is Stochastically Larger than
    17  // the Other". Annals of Mathematical Statistics 18 (1): 50–60.
    18  // Computing this distribution in the presence of ties is described in
    19  // Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
    20  // Journal of the American Statistical Association 61 (315): 772-787
    21  // and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
    22  // Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
    23  // 805-813 (the former paper contains details that are glossed over in
    24  // the latter paper but has mathematical typesetting issues, so it's
    25  // easiest to get the context from the former paper and the details
    26  // from the latter).
    27  type UDist struct {
    28  	N1, N2 int
    29  
    30  	// T is the count of the number of ties at each rank in the
    31  	// input distributions. T may be nil, in which case it is
    32  	// assumed there are no ties (which is equivalent to an M+N
    33  	// slice of 1s). It must be the case that Sum(T) == M+N.
    34  	T []int
    35  }
    36  
    37  // hasTies returns true if d has any tied samples.
    38  func (d UDist) hasTies() bool {
    39  	for _, t := range d.T {
    40  		if t > 1 {
    41  			return true
    42  		}
    43  	}
    44  	return false
    45  }
    46  
    47  // p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947
    48  // for values of U from 0 up to and including the U argument.
    49  //
    50  // This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite
    51  // fast for small values of N1 and N2. However, it does not handle ties.
    52  func (d UDist) p(U int) []float64 {
    53  	// This is a dynamic programming implementation of the
    54  	// recursive recurrence definition given by Mann and Whitney:
    55  	//
    56  	//   p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m)
    57  	//   p_{n,m}(U) = 0                           if U < 0
    58  	//   p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0
    59  	//                          = 0               if U > 0
    60  	//
    61  	// (Note that there is a typo in the original paper. The first
    62  	// recursive application of p should be for U-m, not U-M.)
    63  	//
    64  	// Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only
    65  	// need to store one "plane" of the three dimensional space at
    66  	// a time.
    67  	//
    68  	// Furthermore, p_{n,m} = p_{m,n}, so we only construct values
    69  	// for n <= m and obtain the rest through symmetry.
    70  	//
    71  	// We organize the computed values of p as followed:
    72  	//
    73  	//       n →   N
    74  	//     m *
    75  	//     ↓ * *
    76  	//       * * *
    77  	//       * * * *
    78  	//       * * * *
    79  	//     M * * * *
    80  	//
    81  	// where each * is a slice indexed by U. The code below
    82  	// computes these left-to-right, top-to-bottom, so it only
    83  	// stores one row of this matrix at a time. Furthermore,
    84  	// computing an element in a given U slice only depends on the
    85  	// same and smaller values of U, so we can overwrite the U
    86  	// slice we're computing in place as long as we start with the
    87  	// largest value of U. Finally, even though the recurrence
    88  	// depends on (n,m) above the diagonal and we use symmetry to
    89  	// mirror those across the diagonal to (m,n), the mirrored
    90  	// indexes are always available in the current row, so this
    91  	// mirroring does not interfere with our ability to recycle
    92  	// state.
    93  
    94  	N, M := d.N1, d.N2
    95  	if N > M {
    96  		N, M = M, N
    97  	}
    98  
    99  	memo := make([][]float64, N+1)
   100  	for n := range memo {
   101  		memo[n] = make([]float64, U+1)
   102  	}
   103  
   104  	for m := 0; m <= M; m++ {
   105  		// Compute p_{0,m}. This is zero except for U=0.
   106  		memo[0][0] = 1
   107  
   108  		// Compute the remainder of this row.
   109  		nlim := N
   110  		if m < nlim {
   111  			nlim = m
   112  		}
   113  		for n := 1; n <= nlim; n++ {
   114  			lp := memo[n-1] // p_{n-1,m}
   115  			var rp []float64
   116  			if n <= m-1 {
   117  				rp = memo[n] // p_{n,m-1}
   118  			} else {
   119  				rp = memo[m-1] // p{m-1,n} and m==n
   120  			}
   121  
   122  			// For a given n,m, U is at most n*m.
   123  			//
   124  			// TODO: Actually, it's at most ⌈n*m/2⌉, but
   125  			// then we need to use more complex symmetries
   126  			// in the inner loop below.
   127  			ulim := n * m
   128  			if U < ulim {
   129  				ulim = U
   130  			}
   131  
   132  			out := memo[n] // p_{n,m}
   133  			nplusm := float64(n + m)
   134  			for U1 := ulim; U1 >= 0; U1-- {
   135  				l := 0.0
   136  				if U1-m >= 0 {
   137  					l = float64(n) * lp[U1-m]
   138  				}
   139  				r := float64(m) * rp[U1]
   140  				out[U1] = (l + r) / nplusm
   141  			}
   142  		}
   143  	}
   144  	return memo[N]
   145  }
   146  
   147  type ukey struct {
   148  	n1   int // size of first sample
   149  	twoU int // 2*U statistic for this permutation
   150  }
   151  
   152  // This computes the cumulative counts of the Mann-Whitney U
   153  // distribution in the presence of ties using the computation from
   154  // Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
   155  // Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
   156  // 805-813, with much guidance from appendix L of Klotz, A
   157  // Computational Approach to Statistics.
   158  //
   159  // makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
   160  // number of ranks (up to len(t)), n1 is the size of the first sample
   161  // (up to the n1 argument), and U is the U statistic (up to the
   162  // argument twoU/2). The value of an entry in the memo table is the
   163  // number of permutations of a sample of size n1 in a ranking with tie
   164  // vector t[:K] having a U statistic <= U.
   165  func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
   166  	// Another candidate for a fast implementation is van de Wiel,
   167  	// "The split-up algorithm: a fast symbolic method for
   168  	// computing p-values of distribution-free statistics". This
   169  	// is what's used by R's coin package. It's a comparatively
   170  	// recent publication, so it's presumably faster (or perhaps
   171  	// just more general) than previous techniques, but I can't
   172  	// get my hands on the paper.
   173  	//
   174  	// TODO: ~40% of this function's time is spent in mapassign on
   175  	// the assignment lines in the two loops and another ~20% in
   176  	// map access and iteration. Improving map behavior or
   177  	// replacing the maps altogether with some other constant-time
   178  	// structure could double performance.
   179  	//
   180  	// TODO: The worst case for this function is when there are
   181  	// few ties. Yet the best case overall is when there are *no*
   182  	// ties. Can we get the best of both worlds? Use the fast
   183  	// algorithm for the most part when there are few ties and mix
   184  	// in the general algorithm just where we need it? That's
   185  	// certainly possible for sub-problems where t[:k] has no
   186  	// ties, but that doesn't help if t[0] has a tie but nothing
   187  	// else does. Is it possible to rearrange the ranks without
   188  	// messing up our computation of the U statistic for
   189  	// sub-problems?
   190  
   191  	K := len(t)
   192  
   193  	// Compute a coefficients. The a slice is indexed by k (a[0]
   194  	// is unused).
   195  	a := make([]int, K+1)
   196  	a[1] = t[0]
   197  	for k := 2; k <= K; k++ {
   198  		a[k] = a[k-1] + t[k-2] + t[k-1]
   199  	}
   200  
   201  	// Create the memo table for the counts function, A. The A
   202  	// slice is indexed by k (A[0] is unused).
   203  	//
   204  	// In "The Mann Whitney Distribution Using Linked Lists", they
   205  	// use linked lists (*gasp*) for this, but within each K it's
   206  	// really just a memoization table, so it's faster to use a
   207  	// map. The outer structure is a slice indexed by k because we
   208  	// need to find all memo entries with certain values of k.
   209  	//
   210  	// TODO: The n1 and twoU values in the ukeys follow strict
   211  	// patterns. For each K value, the n1 values are every integer
   212  	// between two bounds. For each (K, n1) value, the twoU values
   213  	// are every integer multiple of a certain base between two
   214  	// bounds. It might be worth turning these into directly
   215  	// indexible slices.
   216  	A := make([]map[ukey]float64, K+1)
   217  	A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}
   218  
   219  	// Compute memo table (k, n1, twoU) triples from high K values
   220  	// to low K values. This drives the recurrence relation
   221  	// downward to figure out all of the needed argument triples.
   222  	//
   223  	// TODO: Is it possible to generate this table bottom-up? If
   224  	// so, this could be a pure dynamic programming algorithm and
   225  	// we could discard the K dimension. We could at least store
   226  	// the inputs in a more compact representation that replaces
   227  	// the twoU dimension with an interval and a step size (as
   228  	// suggested by Cheung, Klotz, not that they make it at all
   229  	// clear *why* they're suggesting this).
   230  	tsum := sumint(t) // always ∑ t[0:k]
   231  	for k := K - 1; k >= 2; k-- {
   232  		tsum -= t[k]
   233  		A[k] = make(map[ukey]float64)
   234  
   235  		// Construct A[k] from A[k+1].
   236  		for A_kplus1 := range A[k+1] {
   237  			rkLow := maxint(0, A_kplus1.n1-tsum)
   238  			rkHigh := minint(A_kplus1.n1, t[k])
   239  			for rk := rkLow; rk <= rkHigh; rk++ {
   240  				twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk)
   241  				n1_k := A_kplus1.n1 - rk
   242  				if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) {
   243  					key := ukey{n1: n1_k, twoU: twoU_k}
   244  					A[k][key] = 0
   245  				}
   246  			}
   247  		}
   248  	}
   249  
   250  	// Fill counts in memo table from low K values to high K
   251  	// values. This unwinds the recurrence relation.
   252  
   253  	// Start with K==2 base case.
   254  	//
   255  	// TODO: Later computations depend on these, but these don't
   256  	// depend on anything (including each other), so if K==2, we
   257  	// can skip the memo table altogether.
   258  	if K < 2 {
   259  		panic("K < 2")
   260  	}
   261  	N_2 := t[0] + t[1]
   262  	for A_2i := range A[2] {
   263  		Asum := 0.0
   264  		r2Low := maxint(0, A_2i.n1-t[0])
   265  		r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2
   266  		for r2 := r2Low; r2 <= r2High; r2++ {
   267  			Asum += mathChoose(t[0], A_2i.n1-r2) *
   268  				mathChoose(t[1], r2)
   269  		}
   270  		A[2][A_2i] = Asum
   271  	}
   272  
   273  	// Derive counts for the rest of the memo table.
   274  	tsum = t[0] // always ∑ t[0:k-1]
   275  	for k := 3; k <= K; k++ {
   276  		tsum += t[k-2]
   277  
   278  		// Compute A[k] counts from A[k-1] counts.
   279  		for A_ki := range A[k] {
   280  			Asum := 0.0
   281  			rkLow := maxint(0, A_ki.n1-tsum)
   282  			rkHigh := minint(A_ki.n1, t[k-1])
   283  			for rk := rkLow; rk <= rkHigh; rk++ {
   284  				twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk)
   285  				n1_kminus1 := A_ki.n1 - rk
   286  				x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}]
   287  				if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 {
   288  					x = mathChoose(tsum, n1_kminus1)
   289  				}
   290  				Asum += x * mathChoose(t[k-1], rk)
   291  			}
   292  			A[k][A_ki] = Asum
   293  		}
   294  	}
   295  
   296  	return A
   297  }
   298  
   299  func twoUmin(n1 int, t, a []int) int {
   300  	K := len(t)
   301  	twoU := -n1 * n1
   302  	n1_k := n1
   303  	for k := 1; k <= K; k++ {
   304  		twoU_k := minint(n1_k, t[k-1])
   305  		twoU += twoU_k * a[k]
   306  		n1_k -= twoU_k
   307  	}
   308  	return twoU
   309  }
   310  
   311  func twoUmax(n1 int, t, a []int) int {
   312  	K := len(t)
   313  	twoU := -n1 * n1
   314  	n1_k := n1
   315  	for k := K; k > 0; k-- {
   316  		twoU_k := minint(n1_k, t[k-1])
   317  		twoU += twoU_k * a[k]
   318  		n1_k -= twoU_k
   319  	}
   320  	return twoU
   321  }
   322  
   323  func (d UDist) PMF(U float64) float64 {
   324  	if U < 0 || U >= 0.5+float64(d.N1*d.N2) {
   325  		return 0
   326  	}
   327  
   328  	if d.hasTies() {
   329  		// makeUmemo computes the CDF directly. Take its
   330  		// difference to get the PMF.
   331  		p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}]
   332  		p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
   333  		if !ok1 || !ok2 {
   334  			panic("makeUmemo did not return expected memoization table")
   335  		}
   336  		return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1)
   337  	}
   338  
   339  	// There are no ties. Use the fast algorithm. U must be integral.
   340  	Ui := int(math.Floor(U))
   341  	// TODO: Use symmetry to minimize U
   342  	return d.p(Ui)[Ui]
   343  }
   344  
   345  func (d UDist) CDF(U float64) float64 {
   346  	if U < 0 {
   347  		return 0
   348  	} else if U >= float64(d.N1*d.N2) {
   349  		return 1
   350  	}
   351  
   352  	if d.hasTies() {
   353  		// TODO: Minimize U?
   354  		p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
   355  		if !ok {
   356  			panic("makeUmemo did not return expected memoization table")
   357  		}
   358  		return p / mathChoose(d.N1+d.N2, d.N1)
   359  	}
   360  
   361  	// There are no ties. Use the fast algorithm. U must be integral.
   362  	Ui := int(math.Floor(U))
   363  	// The distribution is symmetric around U = m * n / 2. Sum up
   364  	// whichever tail is smaller.
   365  	flip := Ui >= (d.N1*d.N2+1)/2
   366  	if flip {
   367  		Ui = d.N1*d.N2 - Ui - 1
   368  	}
   369  	pdfs := d.p(Ui)
   370  	p := 0.0
   371  	for _, pdf := range pdfs[:Ui+1] {
   372  		p += pdf
   373  	}
   374  	if flip {
   375  		p = 1 - p
   376  	}
   377  	return p
   378  }
   379  
   380  func (d UDist) Step() float64 {
   381  	return 0.5
   382  }
   383  
   384  func (d UDist) Bounds() (float64, float64) {
   385  	// TODO: More precise bounds when there are ties.
   386  	return 0, float64(d.N1 * d.N2)
   387  }