github.com/jgbaldwinbrown/perf@v0.1.1/pkg/stats/sample.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package stats
     6  
     7  import (
     8  	"math"
     9  	"sort"
    10  )
    11  
    12  // Sample is a collection of possibly weighted data points.
    13  type Sample struct {
    14  	// Xs is the slice of sample values.
    15  	Xs []float64
    16  
    17  	// Weights[i] is the weight of sample Xs[i].  If Weights is
    18  	// nil, all Xs have weight 1.  Weights must have the same
    19  	// length of Xs and all values must be non-negative.
    20  	Weights []float64
    21  
    22  	// Sorted indicates that Xs is sorted in ascending order.
    23  	Sorted bool
    24  }
    25  
    26  // Bounds returns the minimum and maximum values of xs.
    27  func Bounds(xs []float64) (min float64, max float64) {
    28  	if len(xs) == 0 {
    29  		return math.NaN(), math.NaN()
    30  	}
    31  	min, max = xs[0], xs[0]
    32  	for _, x := range xs {
    33  		if x < min {
    34  			min = x
    35  		}
    36  		if x > max {
    37  			max = x
    38  		}
    39  	}
    40  	return
    41  }
    42  
    43  // Bounds returns the minimum and maximum values of the Sample.
    44  //
    45  // If the Sample is weighted, this ignores samples with zero weight.
    46  //
    47  // This is constant time if s.Sorted and there are no zero-weighted
    48  // values.
    49  func (s Sample) Bounds() (min float64, max float64) {
    50  	if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) {
    51  		return Bounds(s.Xs)
    52  	}
    53  
    54  	if s.Sorted {
    55  		if s.Weights == nil {
    56  			return s.Xs[0], s.Xs[len(s.Xs)-1]
    57  		}
    58  		min, max = math.NaN(), math.NaN()
    59  		for i, w := range s.Weights {
    60  			if w != 0 {
    61  				min = s.Xs[i]
    62  				break
    63  			}
    64  		}
    65  		if math.IsNaN(min) {
    66  			return
    67  		}
    68  		for i := range s.Weights {
    69  			if s.Weights[len(s.Weights)-i-1] != 0 {
    70  				max = s.Xs[len(s.Weights)-i-1]
    71  				break
    72  			}
    73  		}
    74  	} else {
    75  		min, max = math.Inf(1), math.Inf(-1)
    76  		for i, x := range s.Xs {
    77  			w := s.Weights[i]
    78  			if x < min && w != 0 {
    79  				min = x
    80  			}
    81  			if x > max && w != 0 {
    82  				max = x
    83  			}
    84  		}
    85  		if math.IsInf(min, 0) {
    86  			min, max = math.NaN(), math.NaN()
    87  		}
    88  	}
    89  	return
    90  }
    91  
    92  // vecSum returns the sum of xs.
    93  func vecSum(xs []float64) float64 {
    94  	sum := 0.0
    95  	for _, x := range xs {
    96  		sum += x
    97  	}
    98  	return sum
    99  }
   100  
   101  // Sum returns the (possibly weighted) sum of the Sample.
   102  func (s Sample) Sum() float64 {
   103  	if s.Weights == nil {
   104  		return vecSum(s.Xs)
   105  	}
   106  	sum := 0.0
   107  	for i, x := range s.Xs {
   108  		sum += x * s.Weights[i]
   109  	}
   110  	return sum
   111  }
   112  
   113  // Weight returns the total weight of the Sasmple.
   114  func (s Sample) Weight() float64 {
   115  	if s.Weights == nil {
   116  		return float64(len(s.Xs))
   117  	}
   118  	return vecSum(s.Weights)
   119  }
   120  
   121  // Mean returns the arithmetic mean of xs.
   122  func Mean(xs []float64) float64 {
   123  	if len(xs) == 0 {
   124  		return math.NaN()
   125  	}
   126  	m := 0.0
   127  	for i, x := range xs {
   128  		m += (x - m) / float64(i+1)
   129  	}
   130  	return m
   131  }
   132  
   133  // Mean returns the arithmetic mean of the Sample.
   134  func (s Sample) Mean() float64 {
   135  	if len(s.Xs) == 0 || s.Weights == nil {
   136  		return Mean(s.Xs)
   137  	}
   138  
   139  	m, wsum := 0.0, 0.0
   140  	for i, x := range s.Xs {
   141  		// Use weighted incremental mean:
   142  		//   m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i
   143  		//       = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i)
   144  		w := s.Weights[i]
   145  		wsum += w
   146  		m += (x - m) * w / wsum
   147  	}
   148  	return m
   149  }
   150  
   151  // GeoMean returns the geometric mean of xs. xs must be positive.
   152  func GeoMean(xs []float64) float64 {
   153  	if len(xs) == 0 {
   154  		return math.NaN()
   155  	}
   156  	m := 0.0
   157  	for i, x := range xs {
   158  		if x <= 0 {
   159  			return math.NaN()
   160  		}
   161  		lx := math.Log(x)
   162  		m += (lx - m) / float64(i+1)
   163  	}
   164  	return math.Exp(m)
   165  }
   166  
   167  // GeoMean returns the geometric mean of the Sample. All samples
   168  // values must be positive.
   169  func (s Sample) GeoMean() float64 {
   170  	if len(s.Xs) == 0 || s.Weights == nil {
   171  		return GeoMean(s.Xs)
   172  	}
   173  
   174  	m, wsum := 0.0, 0.0
   175  	for i, x := range s.Xs {
   176  		w := s.Weights[i]
   177  		wsum += w
   178  		lx := math.Log(x)
   179  		m += (lx - m) * w / wsum
   180  	}
   181  	return math.Exp(m)
   182  }
   183  
   184  // Variance returns the sample variance of xs.
   185  func Variance(xs []float64) float64 {
   186  	if len(xs) == 0 {
   187  		return math.NaN()
   188  	} else if len(xs) <= 1 {
   189  		return 0
   190  	}
   191  
   192  	// Based on Wikipedia's presentation of Welford 1962
   193  	// (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm).
   194  	// This is more numerically stable than the standard two-pass
   195  	// formula and not prone to massive cancellation.
   196  	mean, M2 := 0.0, 0.0
   197  	for n, x := range xs {
   198  		delta := x - mean
   199  		mean += delta / float64(n+1)
   200  		M2 += delta * (x - mean)
   201  	}
   202  	return M2 / float64(len(xs)-1)
   203  }
   204  
   205  func (s Sample) Variance() float64 {
   206  	if len(s.Xs) == 0 || s.Weights == nil {
   207  		return Variance(s.Xs)
   208  	}
   209  	// TODO(austin)
   210  	panic("Weighted Variance not implemented")
   211  }
   212  
   213  // StdDev returns the sample standard deviation of xs.
   214  func StdDev(xs []float64) float64 {
   215  	return math.Sqrt(Variance(xs))
   216  }
   217  
   218  // StdDev returns the sample standard deviation of the Sample.
   219  func (s Sample) StdDev() float64 {
   220  	if len(s.Xs) == 0 || s.Weights == nil {
   221  		return StdDev(s.Xs)
   222  	}
   223  	// TODO(austin)
   224  	panic("Weighted StdDev not implemented")
   225  }
   226  
   227  // Percentile returns the pctileth value from the Sample. This uses
   228  // interpolation method R8 from Hyndman and Fan (1996).
   229  //
   230  // pctile will be capped to the range [0, 1]. If len(xs) == 0 or all
   231  // weights are 0, returns NaN.
   232  //
   233  // Percentile(0.5) is the median. Percentile(0.25) and
   234  // Percentile(0.75) are the first and third quartiles, respectively.
   235  //
   236  // This is constant time if s.Sorted and s.Weights == nil.
   237  func (s Sample) Percentile(pctile float64) float64 {
   238  	if len(s.Xs) == 0 {
   239  		return math.NaN()
   240  	} else if pctile <= 0 {
   241  		min, _ := s.Bounds()
   242  		return min
   243  	} else if pctile >= 1 {
   244  		_, max := s.Bounds()
   245  		return max
   246  	}
   247  
   248  	if !s.Sorted {
   249  		// TODO(austin) Use select algorithm instead
   250  		s = *s.Copy().Sort()
   251  	}
   252  
   253  	if s.Weights == nil {
   254  		N := float64(len(s.Xs))
   255  		//n := pctile * (N + 1) // R6
   256  		n := 1/3.0 + pctile*(N+1/3.0) // R8
   257  		kf, frac := math.Modf(n)
   258  		k := int(kf)
   259  		if k <= 0 {
   260  			return s.Xs[0]
   261  		} else if k >= len(s.Xs) {
   262  			return s.Xs[len(s.Xs)-1]
   263  		}
   264  		return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1])
   265  	} else {
   266  		// TODO(austin): Implement interpolation
   267  
   268  		target := s.Weight() * pctile
   269  
   270  		// TODO(austin) If we had cumulative weights, we could
   271  		// do this in log time.
   272  		for i, weight := range s.Weights {
   273  			target -= weight
   274  			if target < 0 {
   275  				return s.Xs[i]
   276  			}
   277  		}
   278  		return s.Xs[len(s.Xs)-1]
   279  	}
   280  }
   281  
   282  // IQR returns the interquartile range of the Sample.
   283  //
   284  // This is constant time if s.Sorted and s.Weights == nil.
   285  func (s Sample) IQR() float64 {
   286  	if !s.Sorted {
   287  		s = *s.Copy().Sort()
   288  	}
   289  	return s.Percentile(0.75) - s.Percentile(0.25)
   290  }
   291  
   292  type sampleSorter struct {
   293  	xs      []float64
   294  	weights []float64
   295  }
   296  
   297  func (p *sampleSorter) Len() int {
   298  	return len(p.xs)
   299  }
   300  
   301  func (p *sampleSorter) Less(i, j int) bool {
   302  	return p.xs[i] < p.xs[j]
   303  }
   304  
   305  func (p *sampleSorter) Swap(i, j int) {
   306  	p.xs[i], p.xs[j] = p.xs[j], p.xs[i]
   307  	p.weights[i], p.weights[j] = p.weights[j], p.weights[i]
   308  }
   309  
   310  // Sort sorts the samples in place in s and returns s.
   311  //
   312  // A sorted sample improves the performance of some algorithms.
   313  func (s *Sample) Sort() *Sample {
   314  	if s.Sorted || sort.Float64sAreSorted(s.Xs) {
   315  		// All set
   316  	} else if s.Weights == nil {
   317  		sort.Float64s(s.Xs)
   318  	} else {
   319  		sort.Sort(&sampleSorter{s.Xs, s.Weights})
   320  	}
   321  	s.Sorted = true
   322  	return s
   323  }
   324  
   325  // Copy returns a copy of the Sample.
   326  //
   327  // The returned Sample shares no data with the original, so they can
   328  // be modified (for example, sorted) independently.
   329  func (s Sample) Copy() *Sample {
   330  	xs := make([]float64, len(s.Xs))
   331  	copy(xs, s.Xs)
   332  
   333  	weights := []float64(nil)
   334  	if s.Weights != nil {
   335  		weights = make([]float64, len(s.Weights))
   336  		copy(weights, s.Weights)
   337  	}
   338  
   339  	return &Sample{xs, weights, s.Sorted}
   340  }