github.com/aclements/go-misc@v0.0.0-20240129233631-2f6ede80790c/findflakes/adtest.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  	"sort"
    12  )
    13  
    14  var (
    15  	ErrSampleSize = errors.New("sample is too small")
    16  )
    17  
    18  type SampleValueError struct {
    19  	Value  int
    20  	Detail string
    21  }
    22  
    23  func (e *SampleValueError) Error() string {
    24  	return e.Detail
    25  }
    26  
    27  type AndersonDarlingTestResult struct {
    28  	// A2 is the Anderson-Darling test statistic, A², for the
    29  	// goodness of fit of the sample to the probability
    30  	// distribution.
    31  	A2 float64
    32  
    33  	// P is the p-value for this test. A small value of P
    34  	// indicates a significant difference between the sample and
    35  	// the distribution.
    36  	P float64
    37  }
    38  
    39  // AndersonDarlingTest performs an Anderson-Darling goodness-of-fit
    40  // test for whether a sample comes from a population with a specified
    41  // distribution. It tests the null hypothesis that sample follows dist
    42  // against the alternate hypothesis that sample does not follow dist.
    43  //
    44  // Note that this uses a Monte Carlo method (parametric bootstrap) to
    45  // estimate the distribution of the test statistic and hence the exact
    46  // P value may vary slightly between calls with the same sample and
    47  // distribution.
    48  func AndersonDarlingTest(sample []int, dist *GeometricDist) (*AndersonDarlingTestResult, error) {
    49  	if len(sample) == 0 {
    50  		return nil, ErrSampleSize
    51  	}
    52  
    53  	if !sort.IntsAreSorted(sample) {
    54  		sample = append([]int(nil), sample...)
    55  		sort.Ints(sample)
    56  	}
    57  
    58  	A2, err := andersonDarling(sample, dist)
    59  	if err != nil {
    60  		return nil, err
    61  	}
    62  
    63  	// Use parametric bootstrap to estimate the distribution of
    64  	// A².
    65  	const resamples = 1000
    66  	nsample := make([]int, len(sample))
    67  	ngreater := 0
    68  	for i := 0; i < resamples; i++ {
    69  		for j := range nsample {
    70  			nsample[j] = dist.Rand()
    71  		}
    72  		sort.Ints(nsample)
    73  		nA2, err := andersonDarling(nsample, dist)
    74  		if err != nil {
    75  			return nil, err
    76  		}
    77  		if nA2 >= A2 {
    78  			ngreater++
    79  		}
    80  	}
    81  	p := float64(ngreater) / resamples
    82  
    83  	return &AndersonDarlingTestResult{A2, p}, nil
    84  }
    85  
    86  // andersonDarling returns the Anderson-Darling test statistic, A²,
    87  // for the goodness of fit of sample to dist.
    88  //
    89  // sample must be sorted.
    90  func andersonDarling(sample []int, dist *GeometricDist) (float64, error) {
    91  	sum := 0.0
    92  	// TODO: Rearrange terms so we don't have to compute each
    93  	// sample's CDF twice.
    94  	for i, y1 := range sample {
    95  		y2 := sample[len(sample)-i-1]
    96  		cdf1, sf2 := dist.CDF(y1), dist.SF(y2)
    97  		if cdf1 == 0 {
    98  			return 0, &SampleValueError{
    99  				Value:  y1,
   100  				Detail: fmt.Sprintf("sample %d lies outside support of expected distribution %v", y1, dist),
   101  			}
   102  		}
   103  		if sf2 == 0 {
   104  			return 0, &SampleValueError{
   105  				Value:  y2,
   106  				Detail: fmt.Sprintf("sample %d lies outside support of expected distribution %v", y2, dist),
   107  			}
   108  		}
   109  		sum += float64(2*i-1) * (math.Log(cdf1) + math.Log(sf2))
   110  	}
   111  	return -float64(len(sample)) - sum/float64(len(sample)), nil
   112  }