github.com/gopherd/gonum@v0.0.4/stat/stat.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"github.com/gopherd/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
type CumulantKind int

// List of supported CumulantKind values for the Quantile function.
// Constant values should match the R nomenclature. See
// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
const (
	// Empirical treats the distribution as the actual empirical distribution.
	Empirical CumulantKind = 1
	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
	LinInterp CumulantKind = 4
)

// bhattacharyyaCoeff computes the Bhattacharyya coefficient for the probability
// distributions p and q, given by:
//  \sum_i \sqrt{p_i q_i}
//
// It is assumed that p and q have equal lengths.
func bhattacharyyaCoeff(p, q []float64) float64 {
	var bc float64
	for i, a := range p {
		bc += math.Sqrt(a * q[i])
	}
	return bc
}

// Bhattacharyya computes the distance between the probability distributions p and q given by:
//  -\ln ( \sum_i \sqrt{p_i q_i} )
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Bhattacharyya(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return -math.Log(bc)
}
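// exampleBhattacharyya is an illustrative usage sketch, not part of the
// original source. Identical distributions have a Bhattacharyya coefficient
// of 1, so the distance -ln(1) returned here is 0.
func exampleBhattacharyya() float64 {
	p := []float64{0.25, 0.75}
	q := []float64{0.25, 0.75}
	return Bhattacharyya(p, q)
}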
// CDF returns the empirical cumulative distribution function value at q, that
// is, the fraction of the samples less than or equal to q. The exact behavior
// is determined by the CumulantKind. CDF is theoretically the inverse of the
// Quantile function, though it may not be the actual inverse for all values q
// and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// CDF will panic if the length of x is zero.
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest fraction for which q is greater than or equal
//    to that fraction of samples
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the index.
	switch c {
	case Empirical:
		// Find the smallest value that is greater than that percent of the samples.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}

// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
// expected frequencies 'exp' given by:
//  \sum_i (obs_i-exp_i)^2 / exp_i
//
// The lengths of obs and exp must be equal.
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var result float64
	for i, a := range obs {
		b := exp[i]
		if a == 0 && b == 0 {
			continue
		}
		result += (a - b) * (a - b) / b
	}
	return result
}

// CircularMean returns the circular mean of the dataset:
//  atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func CircularMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}

	var aX, aY float64
	if weights != nil {
		for i, v := range x {
			aX += weights[i] * math.Cos(v)
			aY += weights[i] * math.Sin(v)
		}
	} else {
		for _, v := range x {
			aX += math.Cos(v)
			aY += math.Sin(v)
		}
	}

	return math.Atan2(aY, aX)
}
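// exampleCircularMean is an illustrative usage sketch, not part of the
// original source. The angles 0 and 2π point in the same direction, so their
// circular mean is approximately 0, while their arithmetic mean would be π.
func exampleCircularMean() float64 {
	x := []float64{0, 2 * math.Pi}
	return CircularMean(x, nil)
}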
// Correlation returns the weighted correlation between the samples of x and y
// with the given means:
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two-pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	var (
		sxx           float64
		syy           float64
		sxy           float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			sxx += xd * xd
			syy += yd * yd
			sxy += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		sxx -= xcompensation * xcompensation / float64(len(x))
		syy -= ycompensation * ycompensation / float64(len(x))

		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
	}

	var sumWeights float64
	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		xd := xv - xu
		wxd := w * xd
		yd := yv - yu
		wyd := w * yd
		sxx += wxd * xd
		syy += wyd * yd
		sxy += wxd * yd
		xcompensation += wxd
		ycompensation += wyd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	sxx -= xcompensation * xcompensation / sumWeights
	syy -= ycompensation * ycompensation / sumWeights

	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
}

// Kendall returns the weighted Tau-a Kendall correlation between the
// samples of x and y. The Kendall correlation measures the quantity of
// concordant and discordant pairs of numbers. If weights are specified then
// each pair is weighted by weights[i] * weights[j] and the final sum is
// normalized to stay between -1 and 1.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Kendall(x, y, weights []float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}

	var (
		cc float64 // number of concordant pairs
		dc float64 // number of discordant pairs
		n  = len(x)
	)

	if weights == nil {
		for i := 0; i < n; i++ {
			for j := i; j < n; j++ {
				if i == j {
					continue
				}
				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
					cc++
				} else {
					dc++
				}
			}
		}
		return (cc - dc) / float64(n*(n-1)/2)
	}

	var sumWeights float64

	for i := 0; i < n; i++ {
		for j := i; j < n; j++ {
			if i == j {
				continue
			}
			weight := weights[i] * weights[j]
			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
				cc += weight
			} else {
				dc += weight
			}
			sumWeights += weight
		}
	}
	return (cc - dc) / sumWeights
}

// Covariance returns the weighted covariance between the samples of x and y:
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two-pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	return covarianceMeans(x, y, weights, xu, yu)
}
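// exampleCovariance is an illustrative usage sketch, not part of the original
// source. For x = {1, 2, 3} and y = {2, 4, 6} the unweighted covariance is
// 4/(3-1) = 2, and the correlation of the perfectly linear pair is 1.
func exampleCovariance() (cov, corr float64) {
	x := []float64{1, 2, 3}
	y := []float64{2, 4, 6}
	return Covariance(x, y, nil), Correlation(x, y, nil)
}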
// covarianceMeans returns the weighted covariance between x and y with the
// means of x and y already specified. See the documentation of Covariance for
// more information.
func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
	var (
		ss            float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			ss += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1)
	}

	var sumWeights float64

	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		wxd := w * (xv - xu)
		yd := yv - yu
		ss += wxd * yd
		xcompensation += wxd
		ycompensation += w * yd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1)
}

// CrossEntropy computes the cross-entropy between the two distributions specified
// in p and q.
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var ce float64
	for i, v := range p {
		if v != 0 {
			ce -= v * math.Log(q[i])
		}
	}
	return ce
}

// Entropy computes the Shannon entropy of a distribution. The natural
// logarithm is used.
//  -sum_i (p_i * log_e(p_i))
func Entropy(p []float64) float64 {
	var e float64
	for _, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			e -= v * math.Log(v)
		}
	}
	return e
}

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined as the fourth central moment about the mean divided
// by the squared variance. The excess kurtosis subtracts 3.0 so that the
// excess kurtosis of the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var e float64
		for _, v := range x {
			z := (v - mean) / std
			e += z * z * z * z
		}
		mul, offset := kurtosisCorrection(float64(len(x)))
		return e*mul - offset
	}

	var (
		e          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		e += weights[i] * z * z * z * z
		sumWeights += weights[i]
	}
	mul, offset := kurtosisCorrection(sumWeights)
	return e*mul - offset
}

// kurtosisCorrection returns the correction factors for the sample excess
// kurtosis, where n is the number of samples.
// See https://en.wikipedia.org/wiki/Kurtosis
func kurtosisCorrection(n float64) (mul, offset float64) {
	return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
}
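// exampleEntropy is an illustrative usage sketch, not part of the original
// source. A uniform distribution over four outcomes has entropy ln(4) ≈ 1.386,
// and the cross-entropy of a distribution with itself equals its entropy.
func exampleEntropy() (entropy, crossEntropy float64) {
	p := []float64{0.25, 0.25, 0.25, 0.25}
	return Entropy(p), CrossEntropy(p, p)
}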
// GeometricMean returns the weighted geometric mean of the dataset:
//  \prod_i {x_i ^ (w_i / \sum_j w_j)}
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	if weights == nil {
		var s float64
		for _, v := range x {
			s += math.Log(v)
		}
		s /= float64(len(x))
		return math.Exp(s)
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		s += weights[i] * math.Log(v)
		sumWeights += weights[i]
	}
	s /= sumWeights
	return math.Exp(s)
}

// HarmonicMean returns the weighted harmonic mean of the dataset:
//  \sum_i {w_i} / ( \sum_i {w_i / x_i} )
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO(btracey): Fix this to make it more efficient and avoid allocation.

	// This can be numerically unstable (for example if x is very small).
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	logs := make([]float64, len(x))
	var W float64
	for i := range x {
		if weights == nil {
			logs[i] = -math.Log(x[i])
			W++
			continue
		}
		logs[i] = math.Log(weights[i]) - math.Log(x[i])
		W += weights[i]
	}

	// Sum all of the logs.
	v := floats.LogSumExp(logs) // This computes log(\sum_i {w_i / x_i}).
	return math.Exp(math.Log(W) - v)
}

// Hellinger computes the distance between the probability distributions p and q given by:
//  \sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Hellinger(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - bc)
}
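// exampleMeans is an illustrative usage sketch, not part of the original
// source. The unweighted geometric mean of {1, 100} is 10, and the harmonic
// mean of {1, 4} is 2/(1/1 + 1/4) = 1.6.
func exampleMeans() (gm, hm float64) {
	return GeometricMean([]float64{1, 100}, nil), HarmonicMean([]float64{1, 4}, nil)
}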
// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x < dividers[j+1]. The Span function in the floats package
// can assist with bin creation.
//
// The following conditions on the inputs apply:
//  - The count variable must either be nil or have length of one less than dividers.
//  - The values in dividers must be sorted (use the sort package).
//  - The x values must be sorted.
//  - If weights is nil then all of the weights are 1.
//  - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	for i := range count {
		count[i] = 0
	}
	if len(x) == 0 {
		return count
	}
	if x[0] < dividers[0] {
		panic("histogram: minimum x value is less than lowest divider")
	}
	if dividers[len(dividers)-1] <= x[len(x)-1] {
		panic("histogram: maximum x value is greater than or equal to highest divider")
	}

	idx := 0
	comp := dividers[idx+1]
	if weights == nil {
		for _, v := range x {
			if v < comp {
				// Still in the current bucket.
				count[idx]++
				continue
			}
			// Find the next divider where v is less than the divider.
			for j := idx + 1; j < len(dividers); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
			count[idx]++
		}
		return count
	}

	for i, v := range x {
		if v < comp {
			// Still in the current bucket.
			count[idx] += weights[i]
			continue
		}
		// Need to find the next divider where v is less than the divider.
		for j := idx + 1; j < len(count); j++ {
			if v < dividers[j+1] {
				idx = j
				comp = dividers[j+1]
				break
			}
		}
		count[idx] += weights[i]
	}
	return count
}

// JensenShannon computes the Jensen-Shannon divergence between the
// distributions p and q. The Jensen-Shannon divergence is defined as
//  m = 0.5 * (p + q)
//  JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value
// is between 0 and ln(2).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var js float64
	for i, v := range p {
		qi := q[i]
		m := 0.5 * (v + qi)
		if v != 0 {
			// Add the KL term from p to m.
			js += 0.5 * v * (math.Log(v) - math.Log(m))
		}
		if qi != 0 {
			// Add the KL term from q to m.
			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
		}
	}
	return js
}
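// exampleJensenShannon is an illustrative usage sketch, not part of the
// original source. Disjoint distributions attain the maximum divergence
// ln(2) ≈ 0.693, while identical distributions give 0.
func exampleJensenShannon() (disjoint, identical float64) {
	p := []float64{1, 0}
	q := []float64{0, 1}
	return JensenShannon(p, q), JensenShannon(p, p)
}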
// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset x and y consists of sample locations and counts, xWeights and
// yWeights, respectively.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights). Both x and y must be sorted.
//
// Special cases are:
//  = 0 if len(x) == len(y) == 0
//  = 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64
		xSum, ySum float64
		xCdf, yCdf float64
		xIdx, yIdx int
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both; they'll be equal next time and the right
				// thing will happen.
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}
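// exampleKolmogorovSmirnov is an illustrative usage sketch, not part of the
// original source. Samples at completely disjoint locations have a maximum
// CDF distance of 1; identical samples have a distance of 0.
func exampleKolmogorovSmirnov() (disjoint, identical float64) {
	x := []float64{1, 2}
	y := []float64{10, 11}
	return KolmogorovSmirnov(x, nil, y, nil), KolmogorovSmirnov(x, nil, x, nil)
}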
// updateKS gets the next data point from one of the sets. In doing so, it combines
// the weight of all the data points of equal value. Upon return, val is the new
// value of the data set, newCdf is the total combined CDF up until this point,
// and newIdx is the index of the next location in that sample to examine.
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// Sum up all the weights of consecutive values that are equal.
	if isNil {
		newCdf = cdf + 1/sum
	} else {
		newCdf = cdf + weights[idx]/sum
	}
	newIdx = idx + 1
	for {
		if newIdx == len(values) {
			return values[newIdx-1], newCdf, newIdx
		}
		if values[newIdx-1] != values[newIdx] {
			return values[newIdx], newCdf, newIdx
		}
		if isNil {
			newCdf += 1 / sum
		} else {
			newCdf += weights[newIdx] / sum
		}
		newIdx++
	}
}

// KullbackLeibler computes the Kullback-Leibler divergence between the
// distributions p and q. The natural logarithm is used.
//  sum_i (p_i * log(p_i / q_i))
// Note that the Kullback-Leibler divergence is not symmetric;
//  KullbackLeibler(p, q) != KullbackLeibler(q, p)
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var kl float64
	for i, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			kl += v * (math.Log(v) - math.Log(q[i]))
		}
	}
	return kl
}

// LinearRegression computes the best-fit line
//  y = alpha + beta*x
// to the data in x and y with the given weights. If origin is true, the
// regression is forced to pass through the origin.
//
// Specifically, LinearRegression computes the values of alpha and
// beta such that the total residual
//  \sum_i w[i]*(y[i] - alpha - beta*x[i])^2
// is minimized. If origin is true, then alpha is forced to be zero.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	if origin {
		var x2Sum, xySum float64
		for i, xi := range x {
			if weights != nil {
				w = weights[i]
			}
			yi := y[i]
			xySum += w * xi * yi
			x2Sum += w * xi * xi
		}
		beta = xySum / x2Sum

		return 0, beta
	}

	xu, xv := MeanVariance(x, weights)
	yu := Mean(y, weights)
	cov := covarianceMeans(x, y, weights, xu, yu)
	beta = cov / xv
	alpha = yu - beta*xu
	return alpha, beta
}
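// exampleLinearRegression is an illustrative usage sketch, not part of the
// original source. The points (0,1), (1,3), (2,5) lie exactly on y = 1 + 2x,
// so the fit returns alpha = 1 and beta = 2.
func exampleLinearRegression() (alpha, beta float64) {
	x := []float64{0, 1, 2}
	y := []float64{1, 3, 5}
	return LinearRegression(x, y, nil, false)
}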
// RSquared returns the coefficient of determination defined as
//  R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
// for the line
//  y = alpha + beta*x
// and the data in x and y with the given weights.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	yMean := Mean(y, weights)
	var res, tot, d float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		yi := y[i]
		fi := alpha + beta*xi
		d = yi - fi
		res += w * d * d
		d = yi - yMean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RSquaredFrom returns the coefficient of determination defined as
//  R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
// for the data in estimates and values with the given weights.
//
// The lengths of estimates and values must be equal. If weights is nil then
// all of the weights are 1. If weights is not nil, then len(values) must
// equal len(weights).
func RSquaredFrom(estimates, values, weights []float64) float64 {
	if len(estimates) != len(values) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(values) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	mean := Mean(values, weights)
	var res, tot, d float64
	for i, val := range values {
		if weights != nil {
			w = weights[i]
		}
		d = val - estimates[i]
		res += w * d * d
		d = val - mean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RNoughtSquared returns the coefficient of determination defined as
//  R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
// for the line
//  y = beta*x
// and the data in x and y with the given weights. RNoughtSquared should
// only be used for best-fit lines regressed through the origin.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	var ssr, tot float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		fi := beta * xi
		ssr += w * fi * fi
		yi := y[i]
		tot += w * yi * yi
	}
	return ssr / tot
}

// Mean computes the weighted mean of the dataset:
//  sum_i {w_i * x_i} / sum_i {w_i}
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Mean(x, weights []float64) float64 {
	if weights == nil {
		return floats.Sum(x) / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		sumValues  float64
		sumWeights float64
	)
	for i, w := range weights {
		sumValues += w * x[i]
		sumWeights += w
	}
	return sumValues / sumWeights
}
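// exampleMean is an illustrative usage sketch, not part of the original
// source. The unweighted mean of {1, 2, 3, 4} is 2.5; weighting the samples
// {1, 2} by {1, 3} gives (1*1 + 3*2)/4 = 1.75.
func exampleMean() (unweighted, weighted float64) {
	return Mean([]float64{1, 2, 3, 4}, nil), Mean([]float64{1, 2}, []float64{1, 3})
}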
// Mode returns the most common value in the dataset specified by x and the
// given weights. Strict float64 equality is used when comparing values, so users
// should take caution. If several values are the mode, any of them may be returned.
func Mode(x, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}
	m := make(map[float64]float64)
	if weights == nil {
		for _, v := range x {
			m[v]++
		}
	} else {
		for i, v := range x {
			m[v] += weights[i]
		}
	}
	var (
		maxCount float64
		max      float64
	)
	for val, count := range m {
		if count > maxCount {
			maxCount = count
			max = val
		}
	}
	return max, maxCount
}

// BivariateMoment computes the weighted mixed moment between the samples x and y:
//  E[(x - μ_x)^r*(y - μ_y)^s]
// No degrees of freedom correction is done.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
	meanX := Mean(x, weights)
	meanY := Mean(y, weights)
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights == nil {
		var m float64
		for i, vx := range x {
			vy := y[i]
			m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		}
		return m / float64(len(x))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, vx := range x {
		vy := y[i]
		w := weights[i]
		m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		sumWeights += w
	}
	return m / sumWeights
}

// Moment computes the weighted n-th central moment of the samples:
//  E[(x - μ)^N]
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	// This also checks that x and weights have the same length.
	mean := Mean(x, weights)
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		return m / float64(len(x))
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		w := weights[i]
		m += w * math.Pow(v-mean, moment)
		sumWeights += w
	}
	return m / sumWeights
}

// MomentAbout computes the weighted n-th moment of the samples about
// the given mean μ:
//  E[(x - μ)^N]
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		m /= float64(len(x))
		return m
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		m += weights[i] * math.Pow(v-mean, moment)
		sumWeights += weights[i]
	}
	return m / sumWeights
}
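// exampleMoment is an illustrative usage sketch, not part of the original
// source. The second central moment of {1, 2, 3} about its mean of 2 is
// (1 + 0 + 1)/3 ≈ 0.667; no degrees of freedom correction is applied.
func exampleMoment() float64 {
	return Moment(2, []float64{1, 2, 3}, nil)
}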
// Quantile returns the sample value of x such that it is greater than or
// equal to the fraction p of the samples. The exact behavior is determined by
// the CumulantKind, and p should be a number between 0 and 1. Quantile is
// theoretically the inverse of the CDF function, though it may not be the
// actual inverse for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// Quantile will panic if the length of x is zero.
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest value q for which q is greater than or equal
//    to the fraction p of samples
//  - LinInterp: Returns the linearly interpolated value
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise.
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		return empiricalQuantile(p, x, weights, sumWeights)
	case LinInterp:
		return linInterpQuantile(p, x, weights, sumWeights)
	default:
		panic("stat: bad cumulant kind")
	}
}

func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			return x[i]
		}
	}
	panic("impossible")
}

func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			if i == 0 {
				return x[0]
			}
			t := cumsum - fidx
			if weights != nil {
				t /= weights[i]
			}
			return t*x[i-1] + (1-t)*x[i]
		}
	}
	panic("impossible")
}

// Skew computes the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Skew(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var s float64
		for _, v := range x {
			z := (v - mean) / std
			s += z * z * z
		}
		return s * skewCorrection(float64(len(x)))
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		s += weights[i] * z * z * z
		sumWeights += weights[i]
	}
	return s * skewCorrection(sumWeights)
}

// skewCorrection is the sample skewness correction factor.
// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
func skewCorrection(n float64) float64 {
	return (n / (n - 1)) * (1 / (n - 2))
}
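// exampleQuantile is an illustrative usage sketch, not part of the original
// source. For the sorted data {1, 2, 3, 4}, the empirical 0.5-quantile is 2:
// the cumulative weight first reaches half the total weight at that sample.
func exampleQuantile() float64 {
	x := []float64{1, 2, 3, 4}
	return Quantile(0.5, Empirical, x, nil)
}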
// Weights may be nil, but if weights is non-nil then it must have the same
// length as x.
func SortWeighted(x, weights []float64) {
	if weights == nil {
		sort.Float64s(x)
		return
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightSorter{
		x: x,
		w: weights,
	})
}

type weightSorter struct {
	x []float64
	w []float64
}

func (w weightSorter) Len() int           { return len(w.x) }
func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}

// SortWeightedLabeled rearranges the data in x along with their
// corresponding weights and boolean labels so that the x data are sorted.
// The data is sorted in place. Weights and labels may be nil; if either
// is non-nil, it must have the same length as x.
func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
	if labels == nil {
		SortWeighted(x, weights)
		return
	}
	if weights == nil {
		if len(x) != len(labels) {
			panic("stat: slice length mismatch")
		}
		sort.Sort(labelSorter{
			x: x,
			l: labels,
		})
		return
	}
	if len(x) != len(labels) || len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightLabelSorter{
		x: x,
		l: labels,
		w: weights,
	})
}

type labelSorter struct {
	x []float64
	l []bool
}

func (a labelSorter) Len() int           { return len(a.x) }
func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a labelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
}

type weightLabelSorter struct {
	x []float64
	l []bool
	w []float64
}

func (a weightLabelSorter) Len() int           { return len(a.x) }
func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a weightLabelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
	a.w[i], a.w[j] = a.w[j], a.w[i]
}

// StdDev returns the sample standard deviation.
func StdDev(x, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and unbiased standard deviation.
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values.
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//  (x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}
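// exampleSortWeighted is an illustrative usage sketch, not part of the
// original source. Sorting x in place keeps each weight paired with its
// sample: here x becomes {1, 2, 3} and weights becomes {10, 20, 30}.
func exampleSortWeighted() ([]float64, []float64) {
	x := []float64{3, 1, 2}
	weights := []float64{30, 10, 20}
	SortWeighted(x, weights)
	return x, weights
}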
// Variance computes the unbiased weighted sample variance:
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}

// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are
//  \sum_i w_i * x_i / (sum_i w_i)
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / (sumWeights - 1)
}

// PopMeanVariance computes the sample mean and biased variance (also known as
// "population variance"), where the mean and variance are
//  \sum_i w_i * x_i / (sum_i w_i)
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i)
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopMeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / sumWeights
}

// PopMeanStdDev returns the sample mean and biased standard deviation
// (also known as "population standard deviation").
func PopMeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := PopMeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// PopStdDev returns the population standard deviation, i.e., the square root
// of the biased variance estimate.
func PopStdDev(x, weights []float64) float64 {
	_, stDev := PopMeanStdDev(x, weights)
	return stDev
}

// PopVariance computes the biased weighted sample variance, also known as the
// population variance:
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i)
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopVariance(x, weights []float64) float64 {
	_, variance := PopMeanVariance(x, weights)
	return variance
}

func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
	// This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing
	// the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
	// and Randall J. LeVeque.

	// Note that this will panic if the slice lengths do not match.
	mean = Mean(x, weights)
	var (
		ss           float64
		compensation float64
	)
	if weights == nil {
		for _, v := range x {
			d := v - mean
			ss += d * d
			compensation += d
		}
		unnormalisedVariance = ss - compensation*compensation/float64(len(x))
		return mean, unnormalisedVariance, float64(len(x))
	}

	for i, v := range x {
		w := weights[i]
		d := v - mean
		wd := w * d
		ss += wd * d
		compensation += wd
		sumWeights += w
	}
	unnormalisedVariance = ss - compensation*compensation/sumWeights
	return mean, unnormalisedVariance, sumWeights
}
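// exampleMeanVariance is an illustrative usage sketch, not part of the
// original source. For x = {1, 2, 3, 4, 5} the mean is 3, the unbiased
// variance is 10/4 = 2.5, and the population variance is 10/5 = 2.
func exampleMeanVariance() (mean, unbiased, biased float64) {
	x := []float64{1, 2, 3, 4, 5}
	mean, unbiased = MeanVariance(x, nil)
	_, biased = PopMeanVariance(x, nil)
	return mean, unbiased, biased
}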