// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"github.com/jingcheng-WU/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
type CumulantKind int

// List of supported CumulantKind values for the Quantile function.
// Constant values should match the R nomenclature. See
// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
const (
	// Empirical treats the distribution as the actual empirical distribution.
	Empirical CumulantKind = 1
	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
	LinInterp CumulantKind = 4
)

// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
//  \sum_i \sqrt{p_i q_i}
//
// It is assumed that p and q have equal length.
func bhattacharyyaCoeff(p, q []float64) float64 {
	var bc float64
	for i, a := range p {
		bc += math.Sqrt(a * q[i])
	}
	return bc
}

// Bhattacharyya computes the distance between the probability distributions p and q given by:
//  -\ln ( \sum_i \sqrt{p_i q_i} )
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Bhattacharyya(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return -math.Log(bc)
}

// CDF returns the empirical cumulative distribution function value at q, that is,
// the fraction of the samples less than or equal to q. The
// exact behavior is determined by the CumulantKind. CDF is theoretically
// the inverse of the Quantile function, though it may not be the actual inverse
// for all values q and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest fraction for which q is greater than or equal
//    to that fraction of samples.
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the index.
	switch c {
	case Empirical:
		// Find the smallest value that is greater than that percent of the samples.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}
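
// exampleCDF is a minimal usage sketch for CDF, not part of the original file;
// the data are hypothetical. With nil weights every sample counts as 1, so the
// empirical CDF at a query point is the fraction of samples at or below it.
func exampleCDF() float64 {
	x := []float64{1, 2, 3, 4, 5}    // must already be sorted in increasing order
	return CDF(3, Empirical, x, nil) // 3 of the 5 samples are <= 3, so this is 0.6
}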

// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
// expected frequencies 'exp' given by:
//  \sum_i (obs_i-exp_i)^2 / exp_i
//
// The lengths of obs and exp must be equal.
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var result float64
	for i, a := range obs {
		b := exp[i]
		if a == 0 && b == 0 {
			continue
		}
		result += (a - b) * (a - b) / b
	}
	return result
}

// CircularMean returns the circular mean of the dataset.
//  atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func CircularMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}

	var aX, aY float64
	if weights != nil {
		for i, v := range x {
			aX += weights[i] * math.Cos(v)
			aY += weights[i] * math.Sin(v)
		}
	} else {
		for _, v := range x {
			aX += math.Cos(v)
			aY += math.Sin(v)
		}
	}

	return math.Atan2(aY, aX)
}

// Correlation returns the weighted correlation between the samples of x and y.
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	var (
		sxx           float64
		syy           float64
		sxy           float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			sxx += xd * xd
			syy += yd * yd
			sxy += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et. al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		sxx -= xcompensation * xcompensation / float64(len(x))
		syy -= ycompensation * ycompensation / float64(len(x))

		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
	}

	var sumWeights float64
	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		xd := xv - xu
		wxd := w * xd
		yd := yv - yu
		wyd := w * yd
		sxx += wxd * xd
		syy += wyd * yd
		sxy += wxd * yd
		xcompensation += wxd
		ycompensation += wyd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et. al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	sxx -= xcompensation * xcompensation / sumWeights
	syy -= ycompensation * ycompensation / sumWeights

	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
}
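
// exampleCorrelation sketches Correlation on hypothetical data; it is not part
// of the original file. Exactly linearly related samples give a correlation of ±1.
func exampleCorrelation() (positive, negative float64) {
	x := []float64{1, 2, 3, 4}
	y := []float64{2, 4, 6, 8} // y = 2x, so the correlation is 1
	z := []float64{8, 6, 4, 2} // z decreases linearly in x, so the correlation is -1
	return Correlation(x, y, nil), Correlation(x, z, nil)
}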

// Kendall returns the weighted Tau-a Kendall correlation between the
// samples of x and y. The Kendall correlation measures the quantity of
// concordant and discordant pairs of numbers. If weights are specified then
// each pair is weighted by weights[i] * weights[j] and the final sum is
// normalized to stay between -1 and 1.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Kendall(x, y, weights []float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}

	var (
		cc float64 // number of concordant pairs
		dc float64 // number of discordant pairs
		n  = len(x)
	)

	if weights == nil {
		for i := 0; i < n; i++ {
			for j := i; j < n; j++ {
				if i == j {
					continue
				}
				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
					cc++
				} else {
					dc++
				}
			}
		}
		return (cc - dc) / float64(n*(n-1)/2)
	}

	var sumWeights float64

	for i := 0; i < n; i++ {
		for j := i; j < n; j++ {
			if i == j {
				continue
			}
			weight := weights[i] * weights[j]
			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
				cc += weight
			} else {
				dc += weight
			}
			sumWeights += weight
		}
	}
	return (cc - dc) / sumWeights
}

// Covariance returns the weighted covariance between the samples of x and y.
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	return covarianceMeans(x, y, weights, xu, yu)
}

// covarianceMeans returns the weighted covariance between x and y with the mean
// of x and y already specified. See the documentation of Covariance for more
// information.
func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
	var (
		ss            float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			ss += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et. al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1)
	}

	var sumWeights float64

	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		wxd := w * (xv - xu)
		yd := yv - yu
		ss += wxd * yd
		xcompensation += wxd
		ycompensation += w * yd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et. al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1)
}
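
// exampleCovariance sketches the relation between Covariance and Variance on
// hypothetical data; it is not part of the original file. The covariance of a
// sample with itself is its variance.
func exampleCovariance() (cov, variance float64) {
	x := []float64{1, 3, 5, 7}
	return Covariance(x, x, nil), Variance(x, nil) // both values are equal
}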

// CrossEntropy computes the cross-entropy between the two distributions specified
// in p and q.
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var ce float64
	for i, v := range p {
		if v != 0 {
			ce -= v * math.Log(q[i])
		}
	}
	return ce
}

// Entropy computes the Shannon entropy of the distribution p.
// The natural logarithm is used.
//  -\sum_i (p_i * log_e(p_i))
func Entropy(p []float64) float64 {
	var e float64
	for _, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			e -= v * math.Log(v)
		}
	}
	return e
}

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined by the 4th moment about the mean divided by the squared
// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of
// the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var e float64
		for _, v := range x {
			z := (v - mean) / std
			e += z * z * z * z
		}
		mul, offset := kurtosisCorrection(float64(len(x)))
		return e*mul - offset
	}

	var (
		e          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		e += weights[i] * z * z * z * z
		sumWeights += weights[i]
	}
	mul, offset := kurtosisCorrection(sumWeights)
	return e*mul - offset
}

// n is the number of samples.
// See https://en.wikipedia.org/wiki/Kurtosis
func kurtosisCorrection(n float64) (mul, offset float64) {
	return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
}

// GeometricMean returns the weighted geometric mean of the dataset
//  \prod_i {x_i ^ (w_i / \sum_j w_j)}
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	if weights == nil {
		var s float64
		for _, v := range x {
			s += math.Log(v)
		}
		s /= float64(len(x))
		return math.Exp(s)
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		s += weights[i] * math.Log(v)
		sumWeights += weights[i]
	}
	s /= sumWeights
	return math.Exp(s)
}
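
// exampleEntropy sketches Entropy and CrossEntropy on a hypothetical
// distribution; it is not part of the original file. The cross-entropy of a
// distribution with itself equals its entropy, and is otherwise larger.
func exampleEntropy() (entropy, crossEntropy float64) {
	p := []float64{0.5, 0.25, 0.25}
	return Entropy(p), CrossEntropy(p, p) // both are 1.5*ln(2) ≈ 1.0397
}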

// HarmonicMean returns the weighted harmonic mean of the dataset
//  \sum_i {w_i} / ( \sum_i {w_i / x_i} )
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO(btracey): Fix this to make it more efficient and avoid allocation.

	// This can be numerically unstable (for example if x is very small).
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	logs := make([]float64, len(x))
	var W float64
	for i := range x {
		if weights == nil {
			logs[i] = -math.Log(x[i])
			W++
			continue
		}
		logs[i] = math.Log(weights[i]) - math.Log(x[i])
		W += weights[i]
	}

	// Sum all of the logs.
	v := floats.LogSumExp(logs) // This computes log(\sum_i {w_i / x_i}).
	return math.Exp(math.Log(W) - v)
}

// Hellinger computes the distance between the probability distributions p and q given by:
//  \sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Hellinger(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - bc)
}

// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x[i] < dividers[j+1]. The Span function in the floats package
// can assist with bin creation.
//
// The following conditions on the inputs apply:
//  - The count variable must either be nil or have length of one less than dividers.
//  - The values in dividers must be sorted (use the sort package).
//  - The x values must be sorted.
//  - If weights is nil then all of the weights are 1.
//  - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	for i := range count {
		count[i] = 0
	}
	if len(x) == 0 {
		return count
	}
	if x[0] < dividers[0] {
		panic("histogram: minimum x value is less than lowest divider")
	}
	if dividers[len(dividers)-1] <= x[len(x)-1] {
		panic("histogram: maximum x value is greater than or equal to highest divider")
	}

	idx := 0
	comp := dividers[idx+1]
	if weights == nil {
		for _, v := range x {
			if v < comp {
				// Still in the current bucket.
				count[idx]++
				continue
			}
			// Find the next divider where v is less than the divider.
			for j := idx + 1; j < len(count); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
			count[idx]++
		}
		return count
	}

	for i, v := range x {
		if v < comp {
			// Still in the current bucket.
			count[idx] += weights[i]
			continue
		}
		// Need to find the next divider where v is less than the divider.
		for j := idx + 1; j < len(count); j++ {
			if v < dividers[j+1] {
				idx = j
				comp = dividers[j+1]
				break
			}
		}
		count[idx] += weights[i]
	}
	return count
}
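
// exampleHistogram sketches Histogram with hypothetical data; it is not part
// of the original file. Three dividers define two bins: [0,5) and [5,10).
func exampleHistogram() []float64 {
	x := []float64{1, 3, 5, 6, 7, 8}        // must be sorted
	dividers := []float64{0, 5, 10}         // must be sorted; max(x) must be below the last divider
	return Histogram(nil, dividers, x, nil) // [2, 4]
}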

// JensenShannon computes the Jensen-Shannon divergence between the distributions
// p and q. The Jensen-Shannon divergence is defined as
//  m = 0.5 * (p + q)
//  JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value
// is between 0 and ln(2).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var js float64
	for i, v := range p {
		qi := q[i]
		m := 0.5 * (v + qi)
		if v != 0 {
			// Add the KL term from p to m.
			js += 0.5 * v * (math.Log(v) - math.Log(m))
		}
		if qi != 0 {
			// Add the KL term from q to m.
			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
		}
	}
	return js
}
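
// exampleJensenShannon sketches JensenShannon on hypothetical distributions;
// it is not part of the original file. Identical distributions have divergence
// 0; distributions with disjoint support reach the maximum, ln(2).
func exampleJensenShannon() (same, disjoint float64) {
	p := []float64{0.5, 0.5, 0, 0}
	q := []float64{0, 0, 0.5, 0.5}
	return JensenShannon(p, p), JensenShannon(p, q) // 0 and ln(2) ≈ 0.6931
}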

// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset x and y consists of sample locations and counts, xWeights and
// yWeights, respectively.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights). Both x and y must be sorted.
//
// Special cases are:
//  = 0 if len(x) == len(y) == 0
//  = 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64
		xSum, ySum float64
		xCdf, yCdf float64
		xIdx, yIdx int
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both; they'll be equal next time and the right
				// thing will happen.
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}

// updateKS gets the next data point from one of the sets. In doing so, it combines
// the weight of all the data points of equal value. Upon return, val is the new
// value of the data set, newCdf is the total combined CDF up until this point,
// and newIdx is the index of the next location in that sample to examine.
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// Sum up all the weights of consecutive values that are equal.
	if isNil {
		newCdf = cdf + 1/sum
	} else {
		newCdf = cdf + weights[idx]/sum
	}
	newIdx = idx + 1
	for {
		if newIdx == len(values) {
			return values[newIdx-1], newCdf, newIdx
		}
		if values[newIdx-1] != values[newIdx] {
			return values[newIdx], newCdf, newIdx
		}
		if isNil {
			newCdf += 1 / sum
		} else {
			newCdf += weights[newIdx] / sum
		}
		newIdx++
	}
}

// KullbackLeibler computes the Kullback-Leibler distance between the
// distributions p and q. The natural logarithm is used.
//  sum_i(p_i * log(p_i / q_i))
// Note that the Kullback-Leibler distance is not symmetric;
// KullbackLeibler(p,q) != KullbackLeibler(q,p)
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var kl float64
	for i, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			kl += v * (math.Log(v) - math.Log(q[i]))
		}
	}
	return kl
}
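
// exampleKullbackLeibler sketches the asymmetry of KullbackLeibler on
// hypothetical distributions; it is not part of the original file.
func exampleKullbackLeibler() (pq, qp float64) {
	p := []float64{0.75, 0.25}
	q := []float64{0.5, 0.5}
	return KullbackLeibler(p, q), KullbackLeibler(q, p) // ≈ 0.1308 and ≈ 0.1438
}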

// LinearRegression computes the best-fit line
//  y = alpha + beta*x
// to the data in x and y with the given weights. If origin is true, the
// regression is forced to pass through the origin.
//
// Specifically, LinearRegression computes the values of alpha and
// beta such that the total residual
//  \sum_i w[i]*(y[i] - alpha - beta*x[i])^2
// is minimized. If origin is true, then alpha is forced to be zero.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	if origin {
		var x2Sum, xySum float64
		for i, xi := range x {
			if weights != nil {
				w = weights[i]
			}
			yi := y[i]
			xySum += w * xi * yi
			x2Sum += w * xi * xi
		}
		beta = xySum / x2Sum

		return 0, beta
	}

	xu, xv := MeanVariance(x, weights)
	yu := Mean(y, weights)
	cov := covarianceMeans(x, y, weights, xu, yu)
	beta = cov / xv
	alpha = yu - beta*xu
	return alpha, beta
}

// RSquared returns the coefficient of determination defined as
//  R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
// for the line
//  y = alpha + beta*x
// and the data in x and y with the given weights.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	yMean := Mean(y, weights)
	var res, tot, d float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		yi := y[i]
		fi := alpha + beta*xi
		d = yi - fi
		res += w * d * d
		d = yi - yMean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RSquaredFrom returns the coefficient of determination defined as
//  R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
// for the data in estimates and values with the given weights.
//
// The lengths of estimates and values must be equal. If weights is nil then
// all of the weights are 1. If weights is not nil, then len(values) must
// equal len(weights).
func RSquaredFrom(estimates, values, weights []float64) float64 {
	if len(estimates) != len(values) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(values) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	mean := Mean(values, weights)
	var res, tot, d float64
	for i, val := range values {
		if weights != nil {
			w = weights[i]
		}
		d = val - estimates[i]
		res += w * d * d
		d = val - mean
		tot += w * d * d
	}
	return 1 - res/tot
}
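
// exampleLinearRegression sketches LinearRegression and RSquared on
// hypothetical, exactly linear data; it is not part of the original file.
func exampleLinearRegression() (alpha, beta, r2 float64) {
	x := []float64{0, 1, 2, 3}
	y := []float64{1, 3, 5, 7} // y = 1 + 2x
	alpha, beta = LinearRegression(x, y, nil, false)
	r2 = RSquared(x, y, nil, alpha, beta) // a perfect fit gives R^2 = 1
	return alpha, beta, r2
}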

// RNoughtSquared returns the coefficient of determination defined as
//  R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
// for the line
//  y = beta*x
// and the data in x and y with the given weights. RNoughtSquared should
// only be used for best-fit lines regressed through the origin.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	var ssr, tot float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		fi := beta * xi
		ssr += w * fi * fi
		yi := y[i]
		tot += w * yi * yi
	}
	return ssr / tot
}

// Mean computes the weighted mean of the data set.
//  sum_i {w_i * x_i} / sum_i {w_i}
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Mean(x, weights []float64) float64 {
	if weights == nil {
		return floats.Sum(x) / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		sumValues  float64
		sumWeights float64
	)
	for i, w := range weights {
		sumValues += w * x[i]
		sumWeights += w
	}
	return sumValues / sumWeights
}

// Mode returns the most common value in the dataset specified by x and the
// given weights. Strict float64 equality is used when comparing values, so users
// should take caution. If several values are the mode, any of them may be returned.
func Mode(x, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}
	m := make(map[float64]float64)
	if weights == nil {
		for _, v := range x {
			m[v]++
		}
	} else {
		for i, v := range x {
			m[v] += weights[i]
		}
	}
	var (
		maxCount float64
		max      float64
	)
	for val, count := range m {
		if count > maxCount {
			maxCount = count
			max = val
		}
	}
	return max, maxCount
}

// BivariateMoment computes the weighted mixed moment between the samples x and y.
//  E[(x - μ_x)^r*(y - μ_y)^s]
// No degrees of freedom correction is done.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
	meanX := Mean(x, weights)
	meanY := Mean(y, weights)
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights == nil {
		var m float64
		for i, vx := range x {
			vy := y[i]
			m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		}
		return m / float64(len(x))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, vx := range x {
		vy := y[i]
		w := weights[i]
		m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		sumWeights += w
	}
	return m / sumWeights
}
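
// exampleMeanMode sketches weighted Mean and Mode on hypothetical data; it is
// not part of the original file. A weight acts like repeating the sample that
// many times.
func exampleMeanMode() (mean, mode, count float64) {
	x := []float64{1, 2, 3}
	w := []float64{1, 1, 4} // as if the value 3 appeared four times
	mean = Mean(x, w)       // (1 + 2 + 4*3) / 6 = 2.5
	mode, count = Mode(x, w) // 3, with a weighted count of 4
	return mean, mode, count
}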

// Moment computes the weighted n^th moment of the samples,
//  E[(x - μ)^n]
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	// This also checks that x and weights have the same length.
	mean := Mean(x, weights)
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		return m / float64(len(x))
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		w := weights[i]
		m += w * math.Pow(v-mean, moment)
		sumWeights += w
	}
	return m / sumWeights
}

// MomentAbout computes the weighted n^th moment of the samples about
// the given mean \mu,
//  E[(x - μ)^n]
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		m /= float64(len(x))
		return m
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		m += weights[i] * math.Pow(v-mean, moment)
		sumWeights += weights[i]
	}
	return m / sumWeights
}
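
// exampleMoment sketches Moment on hypothetical data; it is not part of the
// original file. The second central moment is the biased (population)
// variance, since no degrees of freedom correction is applied.
func exampleMoment() (second, popVar float64) {
	x := []float64{2, 4, 6, 8}
	return Moment(2, x, nil), PopVariance(x, nil) // both are 5
}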

// Quantile returns the sample value q such that q is greater than or
// equal to the fraction p of samples. The exact behavior is determined by the
// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically
// the inverse of the CDF function, though it may not be the actual inverse
// for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest value q for which q is greater than or equal
//    to the fraction p of samples.
//  - LinInterp: Returns the linearly interpolated value.
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise.
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		return empiricalQuantile(p, x, weights, sumWeights)
	case LinInterp:
		return linInterpQuantile(p, x, weights, sumWeights)
	default:
		panic("stat: bad cumulant kind")
	}
}

func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			return x[i]
		}
	}
	panic("impossible")
}

func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			if i == 0 {
				return x[0]
			}
			t := cumsum - fidx
			if weights != nil {
				t /= weights[i]
			}
			return t*x[i-1] + (1-t)*x[i]
		}
	}
	panic("impossible")
}

// Skew computes the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Skew(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var s float64
		for _, v := range x {
			z := (v - mean) / std
			s += z * z * z
		}
		return s * skewCorrection(float64(len(x)))
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		s += weights[i] * z * z * z
		sumWeights += weights[i]
	}
	return s * skewCorrection(sumWeights)
}

// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
func skewCorrection(n float64) float64 {
	return (n / (n - 1)) * (1 / (n - 2))
}

// SortWeighted rearranges the data in x along with their corresponding
// weights so that the x data are sorted. The data is sorted in place.
// Weights may be nil, but if weights is non-nil then it must have the same
// length as x.
func SortWeighted(x, weights []float64) {
	if weights == nil {
		sort.Float64s(x)
		return
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightSorter{
		x: x,
		w: weights,
	})
}

type weightSorter struct {
	x []float64
	w []float64
}

func (w weightSorter) Len() int           { return len(w.x) }
func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}
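
// exampleQuantile sketches the two CumulantKind behaviors of Quantile on
// hypothetical data; it is not part of the original file.
func exampleQuantile() (empirical, interpolated float64) {
	x := []float64{1, 2, 3, 4} // must already be sorted
	// Empirical snaps to a sample value; LinInterp interpolates between samples.
	return Quantile(0.4, Empirical, x, nil), Quantile(0.4, LinInterp, x, nil) // 2 and 1.6
}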

// SortWeightedLabeled rearranges the data in x along with their
// corresponding weights and boolean labels so that the x data are sorted.
// The data is sorted in place. Weights and labels may be nil; if either
// is non-nil it must have the same length as x.
func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
	if labels == nil {
		SortWeighted(x, weights)
		return
	}
	if weights == nil {
		if len(x) != len(labels) {
			panic("stat: slice length mismatch")
		}
		sort.Sort(labelSorter{
			x: x,
			l: labels,
		})
		return
	}
	if len(x) != len(labels) || len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightLabelSorter{
		x: x,
		l: labels,
		w: weights,
	})
}

type labelSorter struct {
	x []float64
	l []bool
}

func (a labelSorter) Len() int           { return len(a.x) }
func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a labelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
}

type weightLabelSorter struct {
	x []float64
	l []bool
	w []float64
}

func (a weightLabelSorter) Len() int           { return len(a.x) }
func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a weightLabelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
	a.w[i], a.w[j] = a.w[j], a.w[i]
}

// StdDev returns the sample standard deviation.
func StdDev(x, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and unbiased standard deviation.
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values.
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//  (x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}

// Variance computes the unbiased weighted sample variance:
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}
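
// exampleStdScore sketches MeanStdDev and StdScore on hypothetical data; it is
// not part of the original file. A z-score expresses a value in units of
// standard deviations from the mean.
func exampleStdScore() float64 {
	x := []float64{2, 4, 4, 4, 5, 5, 7, 9}
	mean, std := MeanStdDev(x, nil) // mean 5, unbiased std ≈ 2.14
	return StdScore(11, mean, std)  // ≈ 2.81
}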

// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are
//  \sum_i w_i * x_i / (sum_i w_i)
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / (sumWeights - 1)
}

// PopMeanVariance computes the sample mean and biased variance (also known as
// "population variance"), where the mean and variance are
//  \sum_i w_i * x_i / (sum_i w_i)
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i)
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopMeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / sumWeights
}

// PopMeanStdDev returns the sample mean and biased standard deviation
// (also known as "population standard deviation").
func PopMeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := PopMeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// PopStdDev returns the population standard deviation, i.e., a square root
// of the biased variance estimate.
func PopStdDev(x, weights []float64) float64 {
	_, stDev := PopMeanStdDev(x, weights)
	return stDev
}

// PopVariance computes the biased weighted sample variance (also known as
// "population variance"):
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i)
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopVariance(x, weights []float64) float64 {
	_, variance := PopMeanVariance(x, weights)
	return variance
}

func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
	// This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing
	// the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
	// and Randall J. LeVeque.

	// Note that this will panic if the slice lengths do not match.
	mean = Mean(x, weights)
	var (
		ss           float64
		compensation float64
	)
	if weights == nil {
		for _, v := range x {
			d := v - mean
			ss += d * d
			compensation += d
		}
		unnormalisedVariance = ss - compensation*compensation/float64(len(x))
		return mean, unnormalisedVariance, float64(len(x))
	}

	for i, v := range x {
		w := weights[i]
		d := v - mean
		wd := w * d
		ss += wd * d
		compensation += wd
		sumWeights += w
	}
	unnormalisedVariance = ss - compensation*compensation/sumWeights
	return mean, unnormalisedVariance, sumWeights
}
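
// exampleVariance sketches the difference between the unbiased and population
// estimators on hypothetical data; it is not part of the original file.
func exampleVariance() (unbiased, population float64) {
	x := []float64{2, 4, 6, 8}
	_, unbiased = MeanVariance(x, nil)      // 20 / (4-1) ≈ 6.667
	_, population = PopMeanVariance(x, nil) // 20 / 4 = 5
	return unbiased, population
}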