gonum.org/v1/gonum@v0.14.0/stat/stat.go

// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"gonum.org/v1/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
type CumulantKind int

// List of supported CumulantKind values for the Quantile function.
// Constant values should match the R nomenclature. See
// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
const (
	// Empirical treats the distribution as the actual empirical distribution.
	Empirical CumulantKind = 1
	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
	LinInterp CumulantKind = 4
)

// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
//
//	\sum_i \sqrt{p_i q_i}
//
// It is assumed that p and q have equal length.
func bhattacharyyaCoeff(p, q []float64) float64 {
	var bc float64
	for i, a := range p {
		bc += math.Sqrt(a * q[i])
	}
	return bc
}

// Bhattacharyya computes the distance between the probability distributions p and q given by:
//
//	-\ln ( \sum_i \sqrt{p_i q_i} )
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Bhattacharyya(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return -math.Log(bc)
}

// CDF returns the empirical cumulative distribution function value at q, that is,
// the fraction of the samples less than or equal to q. The
// exact behavior is determined by the CumulantKind. CDF is theoretically
// the inverse of the Quantile function, though it may not be the actual inverse
// for all values q and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// CDF will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest fraction for which q is greater than or equal
//     to that fraction of samples
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the empirical CDF according to the CumulantKind.
	switch c {
	case Empirical:
		// Accumulate the weight of samples that are less than or equal to q.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}
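
// exampleCDFSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It shows CDF with
// the Empirical kind on a small sorted sample, where CDF(q) is the fraction of
// samples less than or equal to q.
func exampleCDFSketch() float64 {
	x := []float64{1, 2, 3, 4, 5} // already sorted, as CDF requires
	// Three of the five samples are <= 3.2, so this evaluates to 0.6.
	return CDF(3.2, Empirical, x, nil)
}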

// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
// expected frequencies 'exp' given by:
//
//	\sum_i (obs_i-exp_i)^2 / exp_i
//
// The lengths of obs and exp must be equal.
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var result float64
	for i, a := range obs {
		b := exp[i]
		if a == 0 && b == 0 {
			continue
		}
		result += (a - b) * (a - b) / b
	}
	return result
}

// CircularMean returns the circular mean of the dataset.
//
//	atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func CircularMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}

	var aX, aY float64
	if weights != nil {
		for i, v := range x {
			aX += weights[i] * math.Cos(v)
			aY += weights[i] * math.Sin(v)
		}
	} else {
		for _, v := range x {
			aX += math.Cos(v)
			aY += math.Sin(v)
		}
	}

	return math.Atan2(aY, aX)
}

// Correlation returns the weighted correlation between the samples of x and y.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / ((sum_j {w_j} - 1) * stdX * stdY)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	var (
		sxx           float64
		syy           float64
		sxy           float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			sxx += xd * xd
			syy += yd * yd
			sxy += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		sxx -= xcompensation * xcompensation / float64(len(x))
		syy -= ycompensation * ycompensation / float64(len(x))

		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
	}

	var sumWeights float64
	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		xd := xv - xu
		wxd := w * xd
		yd := yv - yu
		wyd := w * yd
		sxx += wxd * xd
		syy += wyd * yd
		sxy += wxd * yd
		xcompensation += wxd
		ycompensation += wyd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	sxx -= xcompensation * xcompensation / sumWeights
	syy -= ycompensation * ycompensation / sumWeights

	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
}
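
// exampleCorrelationSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It shows that
// Correlation is exactly 1 for perfectly linearly related data.
func exampleCorrelationSketch() float64 {
	x := []float64{1, 2, 3, 4}
	y := []float64{2, 4, 6, 8} // y = 2x, so the correlation is 1
	return Correlation(x, y, nil)
}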

// Kendall returns the weighted Tau-a Kendall correlation between the
// samples of x and y. The Kendall correlation measures the quantity of
// concordant and discordant pairs of numbers. If weights are specified then
// each pair is weighted by weights[i] * weights[j] and the final sum is
// normalized to stay between -1 and 1.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Kendall(x, y, weights []float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}

	var (
		cc float64 // number of concordant pairs
		dc float64 // number of discordant pairs
		n  = len(x)
	)

	if weights == nil {
		for i := 0; i < n; i++ {
			for j := i; j < n; j++ {
				if i == j {
					continue
				}
				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
					cc++
				} else {
					dc++
				}
			}
		}
		return (cc - dc) / float64(n*(n-1)/2)
	}

	var sumWeights float64

	for i := 0; i < n; i++ {
		for j := i; j < n; j++ {
			if i == j {
				continue
			}
			weight := weights[i] * weights[j]
			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
				cc += weight
			} else {
				dc += weight
			}
			sumWeights += weight
		}
	}
	return (cc - dc) / sumWeights
}

// Covariance returns the weighted covariance between the samples of x and y.
//
//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// This is a two-pass corrected implementation. It is an adaptation of the
	// algorithm used in the MeanVariance function, which applies a correction
	// to the typical two pass approach.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	xu := Mean(x, weights)
	yu := Mean(y, weights)
	return covarianceMeans(x, y, weights, xu, yu)
}

// covarianceMeans returns the weighted covariance between x and y with the means
// of x and y already specified. See the documentation of Covariance for more
// information.
func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
	var (
		ss            float64
		xcompensation float64
		ycompensation float64
	)
	if weights == nil {
		for i, xv := range x {
			yv := y[i]
			xd := xv - xu
			yd := yv - yu
			ss += xd * yd
			xcompensation += xd
			ycompensation += yd
		}
		// xcompensation and ycompensation are from Chan, et al.
		// referenced in the MeanVariance function. They are analogous
		// to the second term in (1.7) in that paper.
		return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1)
	}

	var sumWeights float64

	for i, xv := range x {
		w := weights[i]
		yv := y[i]
		wxd := w * (xv - xu)
		yd := yv - yu
		ss += wxd * yd
		xcompensation += wxd
		ycompensation += w * yd
		sumWeights += w
	}
	// xcompensation and ycompensation are from Chan, et al.
	// referenced in the MeanVariance function. They are analogous
	// to the second term in (1.7) in that paper, except they use
	// the sumWeights instead of the sample count.
	return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1)
}
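
// exampleKendallSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. With one swapped
// pair out of C(5,2) = 10 pairs, Tau-a is (9 - 1) / 10 = 0.8.
func exampleKendallSketch() float64 {
	x := []float64{1, 2, 3, 4, 5}
	y := []float64{1, 3, 2, 4, 5} // the (3, 2) pair is discordant
	return Kendall(x, y, nil)     // 0.8
}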

// CrossEntropy computes the cross-entropy between the two distributions specified
// in p and q.
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var ce float64
	for i, v := range p {
		if v != 0 {
			ce -= v * math.Log(q[i])
		}
	}
	return ce
}

// Entropy computes the Shannon entropy of the distribution p. The natural
// logarithm is used.
//
//	-sum_i (p_i * log_e(p_i))
func Entropy(p []float64) float64 {
	var e float64
	for _, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			e -= v * math.Log(v)
		}
	}
	return e
}

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined by the 4th moment about the mean divided by the squared
// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of
// the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var e float64
		for _, v := range x {
			z := (v - mean) / std
			e += z * z * z * z
		}
		mul, offset := kurtosisCorrection(float64(len(x)))
		return e*mul - offset
	}

	var (
		e          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		e += weights[i] * z * z * z * z
		sumWeights += weights[i]
	}
	mul, offset := kurtosisCorrection(sumWeights)
	return e*mul - offset
}

// kurtosisCorrection returns the multiplier and offset for the sample excess
// kurtosis, where n is the number of samples.
// See https://en.wikipedia.org/wiki/Kurtosis.
func kurtosisCorrection(n float64) (mul, offset float64) {
	return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
}

// GeometricMean returns the weighted geometric mean of the dataset
//
//	(\prod_i {x_i ^ w_i})^(1 / sum_i {w_i})
//
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	if weights == nil {
		var s float64
		for _, v := range x {
			s += math.Log(v)
		}
		s /= float64(len(x))
		return math.Exp(s)
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		s += weights[i] * math.Log(v)
		sumWeights += weights[i]
	}
	s /= sumWeights
	return math.Exp(s)
}
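
// exampleGeometricMeanSketch is an editor's illustrative sketch, not part of
// the original gonum source; the function name is hypothetical. The geometric
// mean is the exponential of the mean of the logs.
func exampleGeometricMeanSketch() float64 {
	x := []float64{1, 3, 9}
	return GeometricMean(x, nil) // exp((ln 1 + ln 3 + ln 9) / 3) = 3
}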

// HarmonicMean returns the weighted harmonic mean of the dataset
//
//	\sum_i {w_i} / ( sum_i {w_i / x_i} )
//
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO(btracey): Fix this to make it more efficient and avoid allocation.

	// This can be numerically unstable (for example if x is very small).
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	logs := make([]float64, len(x))
	var W float64
	for i := range x {
		if weights == nil {
			logs[i] = -math.Log(x[i])
			W++
			continue
		}
		logs[i] = math.Log(weights[i]) - math.Log(x[i])
		W += weights[i]
	}

	// Sum all of the logs.
	v := floats.LogSumExp(logs) // This computes log(\sum_i { w_i / x_i}).
	return math.Exp(math.Log(W) - v)
}

// Hellinger computes the distance between the probability distributions p and q given by:
//
//	\sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
func Hellinger(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	bc := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - bc)
}

// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x < dividers[j+1]. The Span function in the floats package
// can assist with bin creation.
//
// The following conditions on the inputs apply:
//   - The count variable must either be nil or have length one less than dividers.
//   - The values in dividers must be sorted (use the sort package).
//   - The x values must be sorted.
//   - If weights is nil then all of the weights are 1.
//   - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	for i := range count {
		count[i] = 0
	}
	if len(x) == 0 {
		return count
	}
	if x[0] < dividers[0] {
		panic("histogram: minimum x value is less than lowest divider")
	}
	if dividers[len(dividers)-1] <= x[len(x)-1] {
		panic("histogram: maximum x value is greater than or equal to highest divider")
	}

	idx := 0
	comp := dividers[idx+1]
	if weights == nil {
		for _, v := range x {
			if v < comp {
				// Still in the current bucket.
				count[idx]++
				continue
			}
			// Find the next divider where v is less than the divider.
			for j := idx + 1; j < len(count); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
			count[idx]++
		}
		return count
	}

	for i, v := range x {
		if v < comp {
			// Still in the current bucket.
			count[idx] += weights[i]
			continue
		}
		// Need to find the next divider where v is less than the divider.
		for j := idx + 1; j < len(count); j++ {
			if v < dividers[j+1] {
				idx = j
				comp = dividers[j+1]
				break
			}
		}
		count[idx] += weights[i]
	}
	return count
}
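
// exampleHistogramSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It bins five
// sorted samples into the three half-open bins [0,1), [1,2), and [2,3).
func exampleHistogramSketch() []float64 {
	x := []float64{0.1, 0.5, 0.8, 1.5, 2.3} // sorted, within [dividers[0], dividers[len-1])
	dividers := []float64{0, 1, 2, 3}
	return Histogram(nil, dividers, x, nil) // -> [3 1 1]
}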

// JensenShannon computes the Jensen-Shannon divergence between the distributions
// p and q. The Jensen-Shannon divergence is defined as
//
//	m = 0.5 * (p + q)
//	JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
//
// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value
// is between 0 and ln(2).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var js float64
	for i, v := range p {
		qi := q[i]
		m := 0.5 * (v + qi)
		if v != 0 {
			// Add the KL term from p to m.
			js += 0.5 * v * (math.Log(v) - math.Log(m))
		}
		if qi != 0 {
			// Add the KL term from q to m.
			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
		}
	}
	return js
}

// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset, x and y, consists of sample locations with weighted counts
// given by xWeights and yWeights respectively.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights). Both x and y must be sorted.
//
// Special cases are:
//
//	= 0 if len(x) == len(y) == 0
//	= 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64
		xSum, ySum float64
		xCdf, yCdf float64
		xIdx, yIdx int
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the sample values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both; they'll be equal next time and the right
				// thing will happen.
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}
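
// exampleKolmogorovSmirnovSketch is an editor's illustrative sketch, not part
// of the original gonum source; the function name is hypothetical. When the
// two samples are disjoint, one empirical CDF reaches 1 while the other is
// still 0, so the statistic is 1.
func exampleKolmogorovSmirnovSketch() float64 {
	x := []float64{1, 2, 3} // sorted
	y := []float64{4, 5, 6} // sorted, entirely to the right of x
	return KolmogorovSmirnov(x, nil, y, nil) // 1
}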

// updateKS gets the next data point from one of the data sets. In doing so, it combines
// the weight of all the data points of equal value. Upon return, val is the new
// value of the data set, newCdf is the total combined CDF up until this point,
// and newIdx is the index of the next location in that sample to examine.
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// Sum up all the weights of consecutive values that are equal.
	if isNil {
		newCdf = cdf + 1/sum
	} else {
		newCdf = cdf + weights[idx]/sum
	}
	newIdx = idx + 1
	for {
		if newIdx == len(values) {
			return values[newIdx-1], newCdf, newIdx
		}
		if values[newIdx-1] != values[newIdx] {
			return values[newIdx], newCdf, newIdx
		}
		if isNil {
			newCdf += 1 / sum
		} else {
			newCdf += weights[newIdx] / sum
		}
		newIdx++
	}
}

// KullbackLeibler computes the Kullback-Leibler distance between the
// distributions p and q. The natural logarithm is used.
//
//	sum_i(p_i * log(p_i / q_i))
//
// Note that the Kullback-Leibler distance is not symmetric;
// KullbackLeibler(p, q) != KullbackLeibler(q, p).
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var kl float64
	for i, v := range p {
		if v != 0 { // Entropy needs 0 * log(0) == 0.
			kl += v * (math.Log(v) - math.Log(q[i]))
		}
	}
	return kl
}
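
// exampleKullbackLeiblerSketch is an editor's illustrative sketch, not part of
// the original gonum source; the function name is hypothetical. It evaluates
// the divergence of a fair coin from a heavily biased one.
func exampleKullbackLeiblerSketch() float64 {
	p := []float64{0.5, 0.5}
	q := []float64{0.9, 0.1}
	// 0.5*ln(0.5/0.9) + 0.5*ln(0.5/0.1) ≈ 0.511
	return KullbackLeibler(p, q)
}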

// LinearRegression computes the best-fit line
//
//	y = alpha + beta*x
//
// to the data in x and y with the given weights. If origin is true, the
// regression is forced to pass through the origin.
//
// Specifically, LinearRegression computes the values of alpha and
// beta such that the total residual
//
//	\sum_i w[i]*(y[i] - alpha - beta*x[i])^2
//
// is minimized. If origin is true, then alpha is forced to be zero.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	if origin {
		var x2Sum, xySum float64
		for i, xi := range x {
			if weights != nil {
				w = weights[i]
			}
			yi := y[i]
			xySum += w * xi * yi
			x2Sum += w * xi * xi
		}
		beta = xySum / x2Sum

		return 0, beta
	}

	xu, xv := MeanVariance(x, weights)
	yu := Mean(y, weights)
	cov := covarianceMeans(x, y, weights, xu, yu)
	beta = cov / xv
	alpha = yu - beta*xu
	return alpha, beta
}

// RSquared returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
//
// for the line
//
//	y = alpha + beta*x
//
// and the data in x and y with the given weights.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	yMean := Mean(y, weights)
	var res, tot, d float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		yi := y[i]
		fi := alpha + beta*xi
		d = yi - fi
		res += w * d * d
		d = yi - yMean
		tot += w * d * d
	}
	return 1 - res/tot
}
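
// exampleLinearRegressionSketch is an editor's illustrative sketch, not part
// of the original gonum source; the function name is hypothetical. For data
// lying exactly on a line, the fit recovers the line and R^2 is 1.
func exampleLinearRegressionSketch() (alpha, beta, r2 float64) {
	x := []float64{0, 1, 2, 3}
	y := []float64{1, 3, 5, 7} // exactly y = 1 + 2x
	alpha, beta = LinearRegression(x, y, nil, false)
	r2 = RSquared(x, y, nil, alpha, beta) // 1 for an exact fit
	return alpha, beta, r2
}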

// RSquaredFrom returns the coefficient of determination defined as
//
//	R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
//
// for the data in estimates and values with the given weights.
//
// The lengths of estimates and values must be equal. If weights is nil then
// all of the weights are 1. If weights is not nil, then len(values) must
// equal len(weights).
func RSquaredFrom(estimates, values, weights []float64) float64 {
	if len(estimates) != len(values) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(values) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	mean := Mean(values, weights)
	var res, tot, d float64
	for i, val := range values {
		if weights != nil {
			w = weights[i]
		}
		d = val - estimates[i]
		res += w * d * d
		d = val - mean
		tot += w * d * d
	}
	return 1 - res/tot
}

// RNoughtSquared returns the coefficient of determination defined as
//
//	R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
//
// for the line
//
//	y = beta*x
//
// and the data in x and y with the given weights. RNoughtSquared should
// only be used for best-fit lines regressed through the origin.
//
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights != nil && len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}

	w := 1.0
	var ssr, tot float64
	for i, xi := range x {
		if weights != nil {
			w = weights[i]
		}
		fi := beta * xi
		ssr += w * fi * fi
		yi := y[i]
		tot += w * yi * yi
	}
	return ssr / tot
}

// Mean computes the weighted mean of the data set.
//
//	sum_i {w_i * x_i} / sum_i {w_i}
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Mean(x, weights []float64) float64 {
	if weights == nil {
		return floats.Sum(x) / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var (
		sumValues  float64
		sumWeights float64
	)
	for i, w := range weights {
		sumValues += w * x[i]
		sumWeights += w
	}
	return sumValues / sumWeights
}
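
// exampleMeanSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. A weight of 2
// behaves like counting that sample twice.
func exampleMeanSketch() float64 {
	x := []float64{1, 2, 3}
	w := []float64{1, 1, 2} // counts the value 3 twice
	return Mean(x, w)       // (1 + 2 + 3 + 3) / 4 = 2.25
}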

// Mode returns the most common value in the dataset specified by x and the
// given weights. Strict float64 equality is used when comparing values, so users
// should use caution. If several values are the mode, any of them may be returned.
func Mode(x, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}
	m := make(map[float64]float64)
	if weights == nil {
		for _, v := range x {
			m[v]++
		}
	} else {
		for i, v := range x {
			m[v] += weights[i]
		}
	}
	var (
		maxCount float64
		max      float64
	)
	for val, count := range m {
		if count > maxCount {
			maxCount = count
			max = val
		}
	}
	return max, maxCount
}

// BivariateMoment computes the weighted mixed moment between the samples x and y.
//
//	E[(x - μ_x)^r*(y - μ_y)^s]
//
// No degrees of freedom correction is done.
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
	meanX := Mean(x, weights)
	meanY := Mean(y, weights)
	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	if weights == nil {
		var m float64
		for i, vx := range x {
			vy := y[i]
			m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		}
		return m / float64(len(x))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, vx := range x {
		vy := y[i]
		w := weights[i]
		m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s)
		sumWeights += w
	}
	return m / sumWeights
}

// Moment computes the weighted n^th central moment of the samples,
//
//	E[(x - μ)^N]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	// This also checks that x and weights have the same length.
	mean := Mean(x, weights)
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		return m / float64(len(x))
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		w := weights[i]
		m += w * math.Pow(v-mean, moment)
		sumWeights += w
	}
	return m / sumWeights
}

// MomentAbout computes the n^th weighted moment of the samples about
// the given mean μ,
//
//	E[(x - μ)^N]
//
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	if weights == nil {
		var m float64
		for _, v := range x {
			m += math.Pow(v-mean, moment)
		}
		m /= float64(len(x))
		return m
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var (
		m          float64
		sumWeights float64
	)
	for i, v := range x {
		m += weights[i] * math.Pow(v-mean, moment)
		sumWeights += weights[i]
	}
	return m / sumWeights
}
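
// exampleMomentSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. The second central
// moment equals the biased (population) variance.
func exampleMomentSketch() float64 {
	x := []float64{1, 2, 3, 4, 5}
	// Deviations are -2, -1, 0, 1, 2; sum of squares 10; divided by n = 5 gives 2.
	return Moment(2, x, nil)
}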

// Quantile returns the value q such that q is greater than or equal to the
// fraction p of samples. The exact behavior is determined by the
// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically
// the inverse of the CDF function, though it may not be the actual inverse
// for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
// Quantile will panic if the length of x is zero.
//
// CumulantKind behaviors:
//   - Empirical: Returns the lowest value q for which q is greater than or equal
//     to the fraction p of samples
//   - LinInterp: Returns the linearly interpolated value
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise.
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		return empiricalQuantile(p, x, weights, sumWeights)
	case LinInterp:
		return linInterpQuantile(p, x, weights, sumWeights)
	default:
		panic("stat: bad cumulant kind")
	}
}

func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			return x[i]
		}
	}
	panic("impossible")
}

func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
	var cumsum float64
	fidx := p * sumWeights
	for i := range x {
		if weights == nil {
			cumsum++
		} else {
			cumsum += weights[i]
		}
		if cumsum >= fidx {
			if i == 0 {
				return x[0]
			}
			t := cumsum - fidx
			if weights != nil {
				t /= weights[i]
			}
			return t*x[i-1] + (1-t)*x[i]
		}
	}
	panic("impossible")
}

// Skew computes the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Skew(x, weights []float64) float64 {
	mean, std := MeanStdDev(x, weights)
	if weights == nil {
		var s float64
		for _, v := range x {
			z := (v - mean) / std
			s += z * z * z
		}
		return s * skewCorrection(float64(len(x)))
	}
	var (
		s          float64
		sumWeights float64
	)
	for i, v := range x {
		z := (v - mean) / std
		s += weights[i] * z * z * z
		sumWeights += weights[i]
	}
	return s * skewCorrection(sumWeights)
}

// skewCorrection returns the correction factor for the sample skewness.
// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7.
func skewCorrection(n float64) float64 {
	return (n / (n - 1)) * (1 / (n - 2))
}
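
// exampleQuantileSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It contrasts the
// two CumulantKinds of the Quantile function above on the same sorted sample.
func exampleQuantileSketch() (emp, lin float64) {
	x := []float64{1, 2, 3, 4, 5}          // must be sorted in increasing order
	emp = Quantile(0.5, Empirical, x, nil) // 3: the smallest sample covering half the weight
	lin = Quantile(0.5, LinInterp, x, nil) // 2.5: interpolates between samples
	return emp, lin
}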

// SortWeighted rearranges the data in x along with their corresponding
// weights so that the x data are sorted. The data is sorted in place.
// Weights may be nil, but if weights is non-nil then it must have the same
// length as x.
func SortWeighted(x, weights []float64) {
	if weights == nil {
		sort.Float64s(x)
		return
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightSorter{
		x: x,
		w: weights,
	})
}

type weightSorter struct {
	x []float64
	w []float64
}

func (w weightSorter) Len() int           { return len(w.x) }
func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}

// SortWeightedLabeled rearranges the data in x along with their
// corresponding weights and boolean labels so that the x data are sorted.
// The data is sorted in place. Weights and labels may be nil; if either
// is non-nil it must have the same length as x.
func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
	if labels == nil {
		SortWeighted(x, weights)
		return
	}
	if weights == nil {
		if len(x) != len(labels) {
			panic("stat: slice length mismatch")
		}
		sort.Sort(labelSorter{
			x: x,
			l: labels,
		})
		return
	}
	if len(x) != len(labels) || len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	sort.Sort(weightLabelSorter{
		x: x,
		l: labels,
		w: weights,
	})
}

type labelSorter struct {
	x []float64
	l []bool
}

func (a labelSorter) Len() int           { return len(a.x) }
func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a labelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
}

type weightLabelSorter struct {
	x []float64
	l []bool
	w []float64
}

func (a weightLabelSorter) Len() int           { return len(a.x) }
func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
func (a weightLabelSorter) Swap(i, j int) {
	a.x[i], a.x[j] = a.x[j], a.x[i]
	a.l[i], a.l[j] = a.l[j], a.l[i]
	a.w[i], a.w[j] = a.w[j], a.w[i]
}

// StdDev returns the sample standard deviation.
func StdDev(x, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and unbiased standard deviation.
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values.
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//
//	(x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}
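
// exampleStdScoreSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It standardizes a
// value against the sample's own mean and standard deviation.
func exampleStdScoreSketch() float64 {
	x := []float64{2, 4, 4, 4, 5, 5, 7, 9}
	mean, std := MeanStdDev(x, nil)
	// How many standard deviations 9 lies above the mean (≈1.9 here).
	return StdScore(9, mean, std)
}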

// Variance computes the unbiased weighted sample variance:
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}

// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// When weights sum to 1 or less, a biased variance estimator should be used.
func MeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / (sumWeights - 1)
}

// PopMeanVariance computes the sample mean and biased variance (also known as
// "population variance"), where the mean and variance are
//
//	\sum_i w_i * x_i / (sum_i w_i)
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopMeanVariance(x, weights []float64) (mean, variance float64) {
	var (
		unnormalisedVariance float64
		sumWeights           float64
	)
	mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights)
	return mean, unnormalisedVariance / sumWeights
}

// PopMeanStdDev returns the sample mean and biased standard deviation
// (also known as "population standard deviation").
func PopMeanStdDev(x, weights []float64) (mean, std float64) {
	mean, variance := PopMeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// PopStdDev returns the population standard deviation, i.e., the square root
// of the biased variance estimate.
func PopStdDev(x, weights []float64) float64 {
	_, stDev := PopMeanStdDev(x, weights)
	return stDev
}

// PopVariance computes the biased weighted sample variance (also known as the
// "population variance"):
//
//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
//
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func PopVariance(x, weights []float64) float64 {
	_, variance := PopMeanVariance(x, weights)
	return variance
}
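
// exampleVarianceSketch is an editor's illustrative sketch, not part of the
// original gonum source; the function name is hypothetical. It contrasts the
// unbiased and biased estimators: the same sum of squared deviations is
// divided by n-1 and by n respectively.
func exampleVarianceSketch() (unbiased, biased float64) {
	x := []float64{1, 2, 3, 4, 5}
	return Variance(x, nil), PopVariance(x, nil) // 2.5 and 2
}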

func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
	// This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing
	// the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
	// and Randall J. LeVeque.

	// Note that this will panic if the slice lengths do not match.
	mean = Mean(x, weights)
	var (
		ss           float64
		compensation float64
	)
	if weights == nil {
		for _, v := range x {
			d := v - mean
			ss += d * d
			compensation += d
		}
		unnormalisedVariance = ss - compensation*compensation/float64(len(x))
		return mean, unnormalisedVariance, float64(len(x))
	}

	for i, v := range x {
		w := weights[i]
		d := v - mean
		wd := w * d
		ss += wd * d
		compensation += wd
		sumWeights += w
	}
	unnormalisedVariance = ss - compensation*compensation/sumWeights
	return mean, unnormalisedVariance, sumWeights
}
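
// exampleCorrectedTwoPassSketch is an editor's illustrative sketch, not part
// of the original gonum source; the function name is hypothetical and it
// assumes len(x) > 1. It mirrors the unweighted branch of the helper above:
// the compensation term, which would be exactly zero in exact arithmetic,
// removes the first pass's floating-point rounding residual from the sum of
// squares before normalizing.
func exampleCorrectedTwoPassSketch(x []float64) float64 {
	mean := Mean(x, nil)
	var ss, comp float64
	for _, v := range x {
		d := v - mean
		ss += d * d
		comp += d // accumulates only rounding error
	}
	// Chan/Golub/LeVeque (1.7): subtract the residual, then divide by n-1.
	return (ss - comp*comp/float64(len(x))) / float64(len(x)-1)
}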