github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/stats.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package workload
    12  
    13  import (
    14  	"math"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    18  )
    19  
// AutoStatsName is copied from stats.AutoStatsName to avoid pulling
// in a dependency on sql/stats. It is the Name assigned to every
// JSONStatistic built by MakeStat.
const AutoStatsName = "__auto__"
    23  
// JSONStatistic is copied from stats.JSONStatistic to avoid pulling
// in a dependency on sql/stats. It describes one table statistic in a form
// that can be marshaled to JSON using the field tags below.
type JSONStatistic struct {
	// Name is the statistic's name; it is omitted from the JSON output when
	// empty. MakeStat sets it to AutoStatsName.
	Name string `json:"name,omitempty"`
	// CreatedAt is the creation time, stored as an already-formatted string.
	// MakeStat fills it with the current UTC time rounded to the microsecond.
	CreatedAt string `json:"created_at"`
	// Columns holds the names of the columns the statistic covers.
	Columns []string `json:"columns"`
	// RowCount is the row count passed to MakeStat.
	RowCount uint64 `json:"row_count"`
	// DistinctCount is the distinct count passed to MakeStat.
	DistinctCount uint64 `json:"distinct_count"`
	// NullCount is the null count passed to MakeStat.
	NullCount uint64 `json:"null_count"`
}
    34  
    35  // MakeStat returns a JSONStatistic given the column names, row count, distinct
    36  // count, and null count.
    37  func MakeStat(columns []string, rowCount, distinctCount, nullCount uint64) JSONStatistic {
    38  	return JSONStatistic{
    39  		Name:          AutoStatsName,
    40  		CreatedAt:     timeutil.Now().Round(time.Microsecond).UTC().Format(timestampOutputFormat),
    41  		Columns:       columns,
    42  		RowCount:      rowCount,
    43  		DistinctCount: distinctCount,
    44  		NullCount:     nullCount,
    45  	}
    46  }
    47  
    48  // DistinctCount returns the expected number of distinct values in a column
    49  // with rowCount rows, given that the values are chosen from maxDistinctCount
    50  // possible values using uniform random sampling with replacement.
    51  func DistinctCount(rowCount, maxDistinctCount uint64) uint64 {
    52  	n := float64(maxDistinctCount)
    53  	k := float64(rowCount)
    54  	// The probability that one specific value (out of the n possible values)
    55  	// does not appear in any of the k rows is:
    56  	//
    57  	//         ⎛ n-1 ⎞ k
    58  	//     p = ⎜-----⎟
    59  	//         ⎝  n  ⎠
    60  	//
    61  	// Therefore, the probability that a specific value appears at least once is
    62  	// 1-p. Over all n values, the expected number that appear at least once is
    63  	// n * (1-p). In other words, the expected distinct count is:
    64  	//
    65  	//                             ⎛     ⎛ n-1 ⎞ k ⎞
    66  	//     E[distinct count] = n * ⎜ 1 - ⎜-----⎟   ⎟
    67  	//                             ⎝     ⎝  n  ⎠   ⎠
    68  	//
    69  	// See https://math.stackexchange.com/questions/72223/finding-expected-
    70  	//   number-of-distinct-values-selected-from-a-set-of-integers for more info.
    71  	count := n * (1 - math.Pow((n-1)/n, k))
    72  	return uint64(int64(math.Round(count)))
    73  }