// gonum.org/v1/gonum@v0.14.0/stat/stat.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"gonum.org/v1/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
type CumulantKind int

// List of supported CumulantKind values for the Quantile function.
// Constant values should match the R nomenclature. See
// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
const (
	// Empirical treats the distribution as the actual empirical distribution.
	Empirical CumulantKind = 1
	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
	LinInterp CumulantKind = 4
)

// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
//
//	\sum_i \sqrt{p_i q_i}
//
// It is assumed that p and q have equal length.
func bhattacharyyaCoeff(p, q []float64) float64 {
	var bc float64
	for i, a := range p {
		bc += math.Sqrt(a * q[i])
	}
	return bc
}

// Bhattacharyya computes the distance between the probability distributions p and q given by:
//
//	-\ln ( \sum_i \sqrt{p_i q_i} )
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Bhattacharyya(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return -math.Log(bc)
}
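
// A usage sketch for Bhattacharyya (assumed caller code that imports this
// package as stat; not part of the library API):
//
//	p := []float64{0.5, 0.25, 0.25}
//	q := []float64{0.25, 0.25, 0.5}
//	d := stat.Bhattacharyya(p, q) // ≈ 0.044; identical distributions give 0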

// CDF returns the empirical cumulative distribution function value of q, that is,
// the fraction of the samples less than or equal to q. The
// exact behavior is determined by the CumulantKind. CDF is theoretically
// the inverse of the Quantile function, though it may not be the actual inverse
// for all values q and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// CDF will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest fraction for which q is greater than or equal
//     to that fraction of samples
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the CDF value according to the cumulant kind.
	switch c {
	case Empirical:
		// Sum the weights of the samples less than or equal to q.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}
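
// A usage sketch for CDF (assumed caller code; x must already be sorted):
//
//	x := []float64{1, 2, 3, 4}
//	f := stat.CDF(2.5, stat.Empirical, x, nil) // 0.5: half the samples are <= 2.5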

// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
// expected frequencies 'exp' given by:
//
//	\sum_i (obs_i-exp_i)^2 / exp_i
//
// The lengths of obs and exp must be equal.
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var result float64
	for i, a := range obs {
		b := exp[i]
		if a == 0 && b == 0 {
			continue
		}
		result += (a - b) * (a - b) / b
	}
	return result
}

// CircularMean returns the circular mean of the dataset.
//
//	atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func CircularMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}

	var aX, aY float64
	if weights != nil {
		for i, v := range x {
			aX += weights[i] * math.Cos(v)
			aY += weights[i] * math.Sin(v)
		}
	} else {
		for _, v := range x {
			aX += math.Cos(v)
			aY += math.Sin(v)
		}
	}

	return math.Atan2(aY, aX)
}
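
// A sketch of why the circular mean differs from the arithmetic mean
// (assumed caller code; angles are in radians):
//
//	x := []float64{0.1, 2*math.Pi - 0.1}
//	m := stat.CircularMean(x, nil) // ≈ 0: both angles lie next to angle 0 on the
//	                               // circle, while the arithmetic mean is near π.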

// Correlation returns the weighted correlation between the samples of x and y
// with the given means.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	var (
		sxx           float64
		syy           float64
		sxy           float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			sxx += xd * xd
			syy += yd * yd
			sxy += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan et al.,
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		sxx -= xcompensation * xcompensation / float64(len(x))
		syy -= ycompensation * ycompensation / float64(len(x))

		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
	}

	var sumWeights float64
	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		xd := xv - xu
		wxd := w * xd
		yd := yv - yu
		wyd := w * yd
		sxx += wxd * xd
		syy += wyd * yd
		sxy += wxd * yd
		xcompensation += wxd
		ycompensation += wyd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan et al.,
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	sxx -= xcompensation * xcompensation / sumWeights
	syy -= ycompensation * ycompensation / sumWeights

	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
}
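
// A usage sketch (assumed caller code):
//
//	x := []float64{1, 2, 3, 4}
//	y := []float64{2, 4, 6, 8}
//	r := stat.Correlation(x, y, nil) // 1: y is a positive linear function of x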

// Kendall returns the weighted Tau-a Kendall correlation between the
// samples of x and y. The Kendall correlation measures the quantity of
// concordant and discordant pairs of numbers. If weights are specified then
// each pair is weighted by weights[i] * weights[j] and the final sum is
// normalized to stay between -1 and 1.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Kendall(x, y, weights []float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}

	var (
		cc float64 // number of concordant pairs
		dc float64 // number of discordant pairs
		n  = len(x)
	)

	if weights == nil {
		for i := 0; i < n; i++ {
			for j := i + 1; j < n; j++ {
				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
					cc++
				} else {
					dc++
				}
			}
		}
		return (cc - dc) / float64(n*(n-1)/2)
	}

	var sumWeights float64

	for i := 0; i < n; i++ {
		for j := i + 1; j < n; j++ {
			weight := weights[i] * weights[j]
			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
				cc += weight
			} else {
				dc += weight
			}
			sumWeights += weight
		}
	}
	return (cc - dc) / sumWeights
}
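
// A usage sketch (assumed caller code):
//
//	x := []float64{1, 2, 3}
//	y := []float64{3, 1, 2}
//	tau := stat.Kendall(x, y, nil) // (1 concordant - 2 discordant) / 3 pairs = -1/3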

// Covariance returns the weighted covariance between the samples of x and y.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	return covarianceMeans(x, y, weights, xu, yu)
}
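
// A usage sketch (assumed caller code):
//
//	x := []float64{1, 2, 3}
//	y := []float64{1, 2, 3}
//	c := stat.Covariance(x, y, nil) // 1: for identical data this equals Variance(x, nil)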

// covarianceMeans returns the weighted covariance between x and y with the mean
// of x and y already specified. See the documentation of Covariance for more
// information.
func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
	var (
		ss            float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			ss += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan et al.,
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1)
	}

	var sumWeights float64

	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		wxd := w * (xv - xu)
		yd := yv - yu
		ss += wxd * yd
		xcompensation += wxd
		ycompensation += w * yd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan et al.,
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1)
}

// CrossEntropy computes the cross-entropy between the two distributions specified
// in p and q.
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var ce float64
	for i, v := range p {
		if v != 0 {
			ce -= v * math.Log(q[i])
		}
	}
	return ce
}

// Entropy computes the Shannon entropy of a distribution. The natural
// logarithm is used.
//
//	-sum_i (p_i * log_e(p_i))
func Entropy(p []float64) float64 {
	var e float64
	for _, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			e -= v * math.Log(v)
		}
	}
	return e
}
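
// A sketch showing that the uniform distribution maximizes entropy for a
// fixed number of outcomes (assumed caller code):
//
//	uniform := []float64{0.25, 0.25, 0.25, 0.25}
//	e := stat.Entropy(uniform) // ln(4) ≈ 1.386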

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined as the fourth moment about the mean divided by the
// squared variance. The excess kurtosis subtracts 3.0 so that the excess
// kurtosis of the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var e float64
		for _, v := range x {
			z := (v - mean) / std
			e += z * z * z * z
		}
		mul, offset := kurtosisCorrection(float64(len(x)))
		return e*mul - offset
	}

	var (
		e          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		e += weights[i] * z * z * z * z
		sumWeights += weights[i]
	}
	mul, offset := kurtosisCorrection(sumWeights)
	return e*mul - offset
}

// n is the number of samples.
// See https://en.wikipedia.org/wiki/Kurtosis.
func kurtosisCorrection(n float64) (mul, offset float64) {
	return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
}

// GeometricMean returns the weighted geometric mean of the dataset
//
//	\prod_i {x_i ^ (w_i / \sum_j w_j)}
//
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	if weights == nil {
		var s float64
		for _, v := range x {
			s += math.Log(v)
		}
		s /= float64(len(x))
		return math.Exp(s)
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		s += weights[i] * math.Log(v)
		sumWeights += weights[i]
	}
	s /= sumWeights
	return math.Exp(s)
}

// HarmonicMean returns the weighted harmonic mean of the dataset
//
//	\sum_i {w_i} / ( sum_i {w_i / x_i} )
//
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO(btracey): Fix this to make it more efficient and avoid allocation.

	// This can be numerically unstable (for example if x is very small).
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	logs := make([]float64, len(x))
	var W float64
	for i := range x {
		if weights == nil {
			logs[i] = -math.Log(x[i])
			W++
			continue
		}
		logs[i] = math.Log(weights[i]) - math.Log(x[i])
		W += weights[i]
	}

	// Sum all of the logs.
	v := floats.LogSumExp(logs) // This computes log(\sum_i { w_i / x_i}).
	return math.Exp(math.Log(W) - v)
}

// Hellinger computes the distance between the probability distributions p and q given by:
//
//	\sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Hellinger(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - bc)
}

// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x[i] < dividers[j+1]. The Span function in the floats package
// can assist with bin creation.
//
// The following conditions on the inputs apply:
//   - The count variable must either be nil or have length of one less than dividers.
//   - The values in dividers must be sorted (use the sort package).
//   - The x values must be sorted.
//   - If weights is nil then all of the weights are 1.
//   - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	for i := range count {
		count[i] = 0
	}
	if len(x) == 0 {
		return count
	}
	if x[0] < dividers[0] {
		panic("histogram: minimum x value is less than lowest divider")
	}
	if dividers[len(dividers)-1] <= x[len(x)-1] {
		panic("histogram: maximum x value is greater than or equal to highest divider")
	}

	idx := 0
	comp := dividers[idx+1]
	if weights == nil {
		for _, v := range x {
			if v < comp {
				// Still in the current bucket.
				count[idx]++
				continue
			}
			// Find the next divider where v is less than the divider.
			for j := idx + 1; j < len(count); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
			count[idx]++
		}
		return count
	}

	for i, v := range x {
		if v < comp {
			// Still in the current bucket.
			count[idx] += weights[i]
			continue
		}
		// Find the next divider where v is less than the divider.
		for j := idx + 1; j < len(count); j++ {
			if v < dividers[j+1] {
				idx = j
				comp = dividers[j+1]
				break
			}
		}
		count[idx] += weights[i]
	}
	return count
}
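
// A usage sketch (assumed caller code; x and dividers are sorted and all of
// x falls in [dividers[0], dividers[len(dividers)-1])):
//
//	x := []float64{0.1, 0.4, 0.5, 1.2}
//	dividers := []float64{0, 0.5, 1, 1.5}
//	counts := stat.Histogram(nil, dividers, x, nil) // [2 1 1] for bins [0,0.5), [0.5,1), [1,1.5)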

// JensenShannon computes the Jensen-Shannon divergence between the distributions
// p and q. The Jensen-Shannon divergence is defined as
//
//	m = 0.5 * (p + q)
//	JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
//
// Unlike the Kullback-Leibler divergence, the Jensen-Shannon divergence is
// symmetric. The value is between 0 and ln(2).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var js float64
	for i, v := range p {
		qi := q[i]
		m := 0.5 * (v + qi)
		if v != 0 {
			// Add the KL term from p to m.
			js += 0.5 * v * (math.Log(v) - math.Log(m))
		}
		if qi != 0 {
			// Add the KL term from q to m.
			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
		}
	}
	return js
}

// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset x and y consists of sample locations and counts, xWeights and
// yWeights, respectively.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights). Both x and y must be sorted.
//
// Special cases are:
//
//	= 0 if len(x) == len(y) == 0
//	= 1 if exactly one of len(x) and len(y) is 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64
		xSum, ySum float64
		xCdf, yCdf float64
		xIdx, yIdx int
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the sample values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both; they'll be equal next time and the right
				// thing will happen.
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}
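
// A usage sketch (assumed caller code; both samples sorted, nil weights):
//
//	x := []float64{1, 2, 3, 4}
//	y := []float64{1.5, 2.5, 3.5, 4.5}
//	d := stat.KolmogorovSmirnov(x, nil, y, nil) // 0.25, the largest gap between the two CDFs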

// updateKS gets the next data point from one of the sets. In doing so, it combines
// the weight of all the data points of equal value. Upon return, val is the new
// value of the data set, newCdf is the total combined CDF up until this point,
// and newIdx is the index of the next location in that sample to examine.
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// Sum up all the weights of consecutive values that are equal.
	if isNil {
		newCdf = cdf + 1/sum
	} else {
		newCdf = cdf + weights[idx]/sum
	}
	newIdx = idx + 1
	for {
		if newIdx == len(values) {
			return values[newIdx-1], newCdf, newIdx
		}
		if values[newIdx-1] != values[newIdx] {
			return values[newIdx], newCdf, newIdx
		}
		if isNil {
			newCdf += 1 / sum
		} else {
			newCdf += weights[newIdx] / sum
		}
		newIdx++
	}
}

// KullbackLeibler computes the Kullback-Leibler divergence between the
// distributions p and q. The natural logarithm is used.
//
//	sum_i(p_i * log(p_i / q_i))
//
// Note that the Kullback-Leibler divergence is not symmetric;
// KullbackLeibler(p, q) != KullbackLeibler(q, p).
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var kl float64
	for i, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			kl += v * (math.Log(v) - math.Log(q[i]))
		}
	}
	return kl
}

// LinearRegression computes the best-fit line
//
//	y = alpha + beta*x
//
// to the data in x and y with the given weights. If origin is true, the
// regression is forced to pass through the origin.
//
// Specifically, LinearRegression computes the values of alpha and
// beta such that the total residual
//
//	\sum_i w[i]*(y[i] - alpha - beta*x[i])^2
//
// is minimized. If origin is true, then alpha is forced to be zero.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	if origin {
		var x2Sum, xySum float64
		for i, xi := range x {
			if weights != nil {
				w = weights[i]
			}
			yi := y[i]
			xySum += w * xi * yi
			x2Sum += w * xi * xi
		}
		beta = xySum / x2Sum

		return 0, beta
	}

	xu, xv := MeanVariance(x, weights)
	yu := Mean(y, weights)
	cov := covarianceMeans(x, y, weights, xu, yu)
	beta = cov / xv
	alpha = yu - beta*xu
	return alpha, beta
}
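
// A usage sketch combining LinearRegression with RSquared, defined below
// (assumed caller code):
//
//	x := []float64{0, 1, 2, 3}
//	y := []float64{1, 3, 5, 7}
//	alpha, beta := stat.LinearRegression(x, y, nil, false) // alpha = 1, beta = 2
//	r2 := stat.RSquared(x, y, nil, alpha, beta)            // 1: the fit is exact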

// RSquared returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
//
// for the line
//
//	y = alpha + beta*x
//
// and the data in x and y with the given weights.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	yMean := Mean(y, weights)
	var res, tot, d float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		yi := y[i]
		fi := alpha + beta*xi
		d = yi - fi
		res += w * d * d
		d = yi - yMean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RSquaredFrom returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
//
// for the data in estimates and values with the given weights.
//
// The lengths of estimates and values must be equal. If weights is nil then
// all of the weights are 1. If weights is not nil, then len(values) must
// equal len(weights).
func RSquaredFrom(estimates, values, weights []float64) float64 {
	if len(estimates) != len(values) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(values) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	mean := Mean(values, weights)
	var res, tot, d float64
	for i, val := range values {
		if weights != nil {
			w = weights[i]
		}
		d = val - estimates[i]
		res += w * d * d
		d = val - mean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RNoughtSquared returns the coefficient of determination defined as
//
//	R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
//
// for the line
//
//	y = beta*x
//
// and the data in x and y with the given weights. RNoughtSquared should
// only be used for best-fit lines regressed through the origin.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	var ssr, tot float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		fi := beta * xi
		ssr += w * fi * fi
		yi := y[i]
		tot += w * yi * yi
	}
	return ssr / tot
}

// Mean computes the weighted mean of the data set.
//
//	sum_i {w_i * x_i} / sum_i {w_i}
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Mean(x, weights []float64) float64 {
	if weights == nil {
		return floats.Sum(x) / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		sumValues  float64
		sumWeights float64
	)
	for i, w := range weights {
		sumValues += w * x[i]
		sumWeights += w
	}
	return sumValues / sumWeights
}
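
// A sketch showing the effect of weights (assumed caller code):
//
//	x := []float64{1, 2}
//	w := []float64{3, 1}
//	m := stat.Mean(x, w) // (3*1 + 1*2) / (3+1) = 1.25, versus an unweighted mean of 1.5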

// Mode returns the most common value in the dataset specified by x and the
// given weights. Strict float64 equality is used when comparing values, so
// exercise caution. If several values share the highest count, any of them
// may be returned.
func Mode(x, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}
	m := make(map[float64]float64)
	if weights == nil {
		for _, v := range x {
			m[v]++
		}
	} else {
		for i, v := range x {
			m[v] += weights[i]
		}
	}
	var (
		maxCount float64
		max      float64
	)
	for v, c := range m {
		if c > maxCount {
			maxCount = c
			max = v
		}
	}
	return max, maxCount
}

// BivariateMoment computes the weighted mixed moment between the samples x and y.
//
//	E[(x - μ_x)^r*(y - μ_y)^s]
//
// No degrees of freedom correction is done.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
	meanX := Mean(x, weights)
	meanY := Mean(y, weights)
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights == nil {
		var m float64
		for i, vx := range x {
			vy := y[i]
			m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		}
		return m / float64(len(x))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, vx := range x {
		vy := y[i]
		w := weights[i]
		m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		sumWeights += w
	}
	return m / sumWeights
}

// Moment computes the weighted n^th moment of the samples,
//
//	E[(x - μ)^n]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	// This also checks that x and weights have the same length.
	mean := Mean(x, weights)
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		return m / float64(len(x))
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		w := weights[i]
		m += w * math.Pow(v-mean, moment)
		sumWeights += w
	}
	return m / sumWeights
}

// MomentAbout computes the weighted n^th moment of the samples about
// the given mean μ,
//
//	E[(x - μ)^n]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		m /= float64(len(x))
		return m
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		m += weights[i] * math.Pow(v-mean, moment)
		sumWeights += weights[i]
	}
	return m / sumWeights
}

// Quantile returns the value q such that the fraction p of samples is less
// than or equal to q. The exact behavior is determined by the CumulantKind,
// and p should be a number between 0 and 1. Quantile is theoretically
// the inverse of the CDF function, though it may not be the actual inverse
// for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// Quantile will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest value q for which q is greater than or equal
//     to the fraction p of samples
//   - LinInterp: Returns the linearly interpolated value
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise.
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		return empiricalQuantile(p, x, weights, sumWeights)
	case LinInterp:
		return linInterpQuantile(p, x, weights, sumWeights)
	default:
		panic("stat: bad cumulant kind")
	}
}
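
// A sketch contrasting the two CumulantKinds (assumed caller code; x sorted):
//
//	x := []float64{1, 2, 3, 4}
//	qe := stat.Quantile(0.4, stat.Empirical, x, nil) // 2: lowest sample covering the fraction
//	ql := stat.Quantile(0.4, stat.LinInterp, x, nil) // 1.6: interpolated between 1 and 2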

func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			return x[i]
		}
	}
	panic("impossible")
}

func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			if i == 0 {
				return x[0]
			}
			t := cumsum - fidx
			if weights != nil {
				t /= weights[i]
			}
			return t*x[i-1] + (1-t)*x[i]
		}
	}
	panic("impossible")
}

// Skew computes the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Skew(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var s float64
		for _, v := range x {
			z := (v - mean) / std
			s += z * z * z
		}
		return s * skewCorrection(float64(len(x)))
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		s += weights[i] * z * z * z
		sumWeights += weights[i]
	}
	return s * skewCorrection(sumWeights)
}

// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
func skewCorrection(n float64) float64 {
	return (n / (n - 1)) * (1 / (n - 2))
}

// SortWeighted rearranges the data in x along with their corresponding
// weights so that the x data are sorted. The data is sorted in place.
// Weights may be nil, but if weights is non-nil then it must have the same
// length as x.
func SortWeighted(x, weights []float64) {
	if weights == nil {
		sort.Float64s(x)
		return
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightSorter{
		x: x,
		w: weights,
	})
}

type weightSorter struct {
	x []float64
	w []float64
}

func (w weightSorter) Len() int           { return len(w.x) }
func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}

// SortWeightedLabeled rearranges the data in x along with their
// corresponding weights and boolean labels so that the x data are sorted.
// The data is sorted in place. Weights and labels may be nil; if either
// is non-nil it must have the same length as x.
func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
	if labels == nil {
		SortWeighted(x, weights)
		return
	}
	if weights == nil {
		if len(x) != len(labels) {
			panic("stat: slice length mismatch")
		}
		sort.Sort(labelSorter{
			x: x,
			l: labels,
		})
		return
	}
	if len(x) != len(labels) || len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightLabelSorter{
		x: x,
		l: labels,
		w: weights,
	})
}

type labelSorter struct {
	x []float64
	l []bool
}

func (a labelSorter) Len() int           { return len(a.x) }
func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a labelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
}

type weightLabelSorter struct {
	x []float64
	l []bool
	w []float64
}

func (a weightLabelSorter) Len() int           { return len(a.x) }
func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a weightLabelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
	a.w[i], a.w[j] = a.w[j], a.w[i]
}

// StdDev returns the sample standard deviation.
func StdDev(x, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and unbiased standard deviation.
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values.
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//
//	(x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}

// Variance computes the unbiased weighted sample variance:
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}

// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / (sumWeights - 1)
}
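
// A usage sketch (assumed caller code):
//
//	x := []float64{1, 2, 3, 4, 5}
//	mean, variance := stat.MeanVariance(x, nil) // mean = 3, variance = 10/(5-1) = 2.5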

// PopMeanVariance computes the sample mean and biased variance (also known as
// "population variance"), where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopMeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / sumWeights
}

// PopMeanStdDev returns the sample mean and biased standard deviation
// (also known as "population standard deviation").
func PopMeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := PopMeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// PopStdDev returns the population standard deviation, i.e., a square root
// of the biased variance estimate.
func PopStdDev(x, weights []float64) float64 {
	_, stDev := PopMeanStdDev(x, weights)
	return stDev
}

// PopVariance computes the biased weighted sample variance (also known as
// "population variance"):
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopVariance(x, weights []float64) float64 {
	_, variance := PopMeanVariance(x, weights)
	return variance
}

func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
	// This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing
	// the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
	// and Randall J. LeVeque.

	// Note that this will panic if the slice lengths do not match.
	mean = Mean(x, weights)
	var (
		ss           float64
		compensation float64
	)
	if weights == nil {
		for _, v := range x {
			d := v - mean
			ss += d * d
			compensation += d
		}
		unnormalisedVariance = ss - compensation*compensation/float64(len(x))
		return mean, unnormalisedVariance, float64(len(x))
	}

	for i, v := range x {
		w := weights[i]
		d := v - mean
		wd := w * d
		ss += wd * d
		compensation += wd
		sumWeights += w
	}
	unnormalisedVariance = ss - compensation*compensation/sumWeights
	return mean, unnormalisedVariance, sumWeights
}