vitess.io/vitess@v0.16.2/go/mathstats/sample.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package mathstats 6 7 import ( 8 "math" 9 "sort" 10 ) 11 12 // Sample is a collection of possibly weighted data points. 13 type Sample struct { 14 // Xs is the slice of sample values. 15 Xs []float64 16 17 // Sorted indicates that Xs is sorted in ascending order. 18 Sorted bool 19 } 20 21 // Bounds returns the minimum and maximum values of xs. 22 func Bounds(xs []float64) (min float64, max float64) { 23 if len(xs) == 0 { 24 return math.NaN(), math.NaN() 25 } 26 min, max = xs[0], xs[0] 27 for _, x := range xs { 28 if x < min { 29 min = x 30 } 31 if x > max { 32 max = x 33 } 34 } 35 return 36 } 37 38 // Bounds returns the minimum and maximum values of the Sample. 39 // 40 // If the Sample is weighted, this ignores samples with zero weight. 41 // 42 // This is constant time if s.Sorted and there are no zero-weighted 43 // values. 44 func (s Sample) Bounds() (min float64, max float64) { 45 if len(s.Xs) == 0 || !s.Sorted { 46 return Bounds(s.Xs) 47 } 48 return s.Xs[0], s.Xs[len(s.Xs)-1] 49 } 50 51 // vecSum returns the sum of xs. 52 func vecSum(xs []float64) float64 { 53 sum := 0.0 54 for _, x := range xs { 55 sum += x 56 } 57 return sum 58 } 59 60 // Sum returns the (possibly weighted) sum of the Sample. 61 func (s Sample) Sum() float64 { 62 return vecSum(s.Xs) 63 } 64 65 // Weight returns the total weight of the Sasmple. 66 func (s Sample) Weight() float64 { 67 return float64(len(s.Xs)) 68 } 69 70 // Mean returns the arithmetic mean of xs. 71 func Mean(xs []float64) float64 { 72 if len(xs) == 0 { 73 return math.NaN() 74 } 75 m := 0.0 76 for i, x := range xs { 77 m += (x - m) / float64(i+1) 78 } 79 return m 80 } 81 82 // Mean returns the arithmetic mean of the Sample. 83 func (s Sample) Mean() float64 { 84 return Mean(s.Xs) 85 } 86 87 // GeoMean returns the geometric mean of xs. xs must be positive. 88 func GeoMean(xs []float64) float64 { 89 if len(xs) == 0 { 90 return math.NaN() 91 } 92 m := 0.0 93 for i, x := range xs { 94 if x <= 0 { 95 return math.NaN() 96 } 97 lx := math.Log(x) 98 m += (lx - m) / float64(i+1) 99 } 100 return math.Exp(m) 101 } 102 103 // GeoMean returns the geometric mean of the Sample. All samples 104 // values must be positive. 105 func (s Sample) GeoMean() float64 { 106 return GeoMean(s.Xs) 107 } 108 109 // Variance returns the sample variance of xs. 110 func Variance(xs []float64) float64 { 111 if len(xs) == 0 { 112 return math.NaN() 113 } else if len(xs) <= 1 { 114 return 0 115 } 116 117 // Based on Wikipedia's presentation of Welford 1962 118 // (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm). 119 // This is more numerically stable than the standard two-pass 120 // formula and not prone to massive cancellation. 121 mean, M2 := 0.0, 0.0 122 for n, x := range xs { 123 delta := x - mean 124 mean += delta / float64(n+1) 125 M2 += delta * (x - mean) 126 } 127 return M2 / float64(len(xs)-1) 128 } 129 130 // Variance returns the variance of xs 131 func (s Sample) Variance() float64 { 132 return Variance(s.Xs) 133 } 134 135 // StdDev returns the sample standard deviation of xs. 136 func StdDev(xs []float64) float64 { 137 return math.Sqrt(Variance(xs)) 138 } 139 140 // StdDev returns the sample standard deviation of the Sample. 141 func (s Sample) StdDev() float64 { 142 return StdDev(s.Xs) 143 } 144 145 // Percentile returns the pctileth value from the Sample. This uses 146 // interpolation method R8 from Hyndman and Fan (1996). 147 // 148 // pctile will be capped to the range [0, 1]. If len(xs) == 0 or all 149 // weights are 0, returns NaN. 150 // 151 // Percentile(0.5) is the median. Percentile(0.25) and 152 // Percentile(0.75) are the first and third quartiles, respectively. 153 // 154 // This is constant time if s.Sorted and s.Weights == nil. 155 func (s *Sample) Percentile(pctile float64) float64 { 156 if len(s.Xs) == 0 { 157 return math.NaN() 158 } else if pctile <= 0 { 159 min, _ := s.Bounds() 160 return min 161 } else if pctile >= 1 { 162 _, max := s.Bounds() 163 return max 164 } 165 166 if !s.Sorted { 167 s.Sort() 168 } 169 170 N := float64(len(s.Xs)) 171 //n := pctile * (N + 1) // R6 172 n := 1/3.0 + pctile*(N+1/3.0) // R8 173 kf, frac := math.Modf(n) 174 k := int(kf) 175 if k <= 0 { 176 return s.Xs[0] 177 } else if k >= len(s.Xs) { 178 return s.Xs[len(s.Xs)-1] 179 } 180 return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1]) 181 } 182 183 // IQR returns the interquartile range of the Sample. 184 // 185 // This is constant time if s.Sorted and s.Weights == nil. 186 func (s Sample) IQR() float64 { 187 if !s.Sorted { 188 s = *s.Copy().Sort() 189 } 190 return s.Percentile(0.75) - s.Percentile(0.25) 191 } 192 193 // Sort sorts the samples in place in s and returns s. 194 // 195 // A sorted sample improves the performance of some algorithms. 196 func (s *Sample) Sort() *Sample { 197 if s.Sorted || sort.Float64sAreSorted(s.Xs) { 198 // All set 199 } else { 200 sort.Float64s(s.Xs) 201 } 202 s.Sorted = true 203 return s 204 } 205 206 // Copy returns a copy of the Sample. 207 // 208 // The returned Sample shares no data with the original, so they can 209 // be modified (for example, sorted) independently. 210 func (s Sample) Copy() *Sample { 211 xs := make([]float64, len(s.Xs)) 212 copy(xs, s.Xs) 213 return &Sample{xs, s.Sorted} 214 } 215 216 // FilterOutliers updates this sample in-place by removing all the values that are outliers 217 func (s *Sample) FilterOutliers() { 218 // Discard outliers. 219 q1, q3 := s.Percentile(0.25), s.Percentile(0.75) 220 lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1) 221 nn := 0 222 for _, value := range s.Xs { 223 if lo <= value && value <= hi { 224 s.Xs[nn] = value 225 nn++ 226 } 227 } 228 s.Xs = s.Xs[:nn] 229 } 230 231 // Clear resets this sample so it contains 0 values 232 func (s *Sample) Clear() { 233 s.Xs = s.Xs[:0] 234 s.Sorted = false 235 }