github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/stats.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package workload 12 13 import ( 14 "math" 15 "time" 16 17 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 18 ) 19 20 // AutoStatsName is copied from stats.AutoStatsName to avoid pulling 21 // in a dependency on sql/stats. 22 const AutoStatsName = "__auto__" 23 24 // JSONStatistic is copied from stats.JSONStatistic to avoid pulling 25 // in a dependency on sql/stats. 26 type JSONStatistic struct { 27 Name string `json:"name,omitempty"` 28 CreatedAt string `json:"created_at"` 29 Columns []string `json:"columns"` 30 RowCount uint64 `json:"row_count"` 31 DistinctCount uint64 `json:"distinct_count"` 32 NullCount uint64 `json:"null_count"` 33 } 34 35 // MakeStat returns a JSONStatistic given the column names, row count, distinct 36 // count, and null count. 37 func MakeStat(columns []string, rowCount, distinctCount, nullCount uint64) JSONStatistic { 38 return JSONStatistic{ 39 Name: AutoStatsName, 40 CreatedAt: timeutil.Now().Round(time.Microsecond).UTC().Format(timestampOutputFormat), 41 Columns: columns, 42 RowCount: rowCount, 43 DistinctCount: distinctCount, 44 NullCount: nullCount, 45 } 46 } 47 48 // DistinctCount returns the expected number of distinct values in a column 49 // with rowCount rows, given that the values are chosen from maxDistinctCount 50 // possible values using uniform random sampling with replacement. 
//
// The result is 0 when either rowCount or maxDistinctCount is 0, and it never
// exceeds rowCount or maxDistinctCount.
func DistinctCount(rowCount, maxDistinctCount uint64) uint64 {
	// With no rows or no candidate values there is nothing to count. Handling
	// this up front also keeps the floating-point math below away from
	// 0 * Inf, which is NaN and has no portable integer conversion.
	if rowCount == 0 || maxDistinctCount == 0 {
		return 0
	}

	n := float64(maxDistinctCount)
	k := float64(rowCount)
	// The probability that one specific value (out of the n possible values)
	// does not appear in any of the k rows is:
	//
	//     p = ((n-1) / n) ^ k
	//
	// Therefore, the probability that a specific value appears at least once is
	// 1-p. Over all n values, the expected number that appear at least once is
	// n * (1-p). In other words, the expected distinct count is:
	//
	//     E[distinct count] = n * (1 - ((n-1) / n) ^ k)
	//
	// See https://math.stackexchange.com/questions/72223/finding-expected-
	// number-of-distinct-values-selected-from-a-set-of-integers for more info.
	//
	// Computing (n-1)/n directly is numerically fragile: the quotient rounds to
	// exactly 1 once n exceeds 2^53 (and loses most of its precision well
	// before that), which makes the expected count collapse to 0. The
	// equivalent form
	//
	//     1 - p = -expm1(k * log1p(-1/n))
	//
	// keeps full precision for any n. For n == 1 it evaluates to
	// -expm1(-Inf) = 1, as expected.
	count := -n * math.Expm1(k*math.Log1p(-1/n))

	// count lies in [0, 2^64) here, so it is converted straight to uint64; an
	// intermediate int64 could not represent results at or above 2^63.
	distinct := uint64(math.Round(count))

	// The true expectation is bounded by both inputs; clamp so that rounding
	// error can never report more distinct values than rows or candidates.
	if distinct > rowCount {
		distinct = rowCount
	}
	if distinct > maxDistinctCount {
		distinct = maxDistinctCount
	}
	return distinct
}