github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/estimate.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"math"
    18  
    19  	"github.com/cznic/mathutil"
    20  )
    21  
    22  // calculateEstimateNDV calculates the estimate ndv of a sampled data from a multisize with size total.
    23  func calculateEstimateNDV(h *topNHelper, rowCount uint64) (ndv uint64, scaleRatio uint64) {
    24  	sampleSize, sampleNDV, onlyOnceItems := h.sampleSize, uint64(len(h.sorted)), h.onlyOnceItems
    25  	scaleRatio = rowCount / sampleSize
    26  
    27  	if onlyOnceItems == sampleSize {
    28  		// Assume this is a unique column, so do not scale up the count of elements
    29  		return rowCount, 1
    30  	} else if onlyOnceItems == 0 {
    31  		// Assume data only consists of sampled data
    32  		// Nothing to do, no change with scale ratio
    33  		return sampleNDV, scaleRatio
    34  	}
    35  	// Charikar, Moses, et al. "Towards estimation error guarantees for distinct values."
    36  	// Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2000.
    37  	// This is GEE in that paper.
    38  	// estimateNDV = sqrt(N/n) f_1 + sum_2..inf f_i
    39  	// f_i = number of elements occurred i times in sample
    40  
    41  	f1 := float64(onlyOnceItems)
    42  	n := float64(sampleSize)
    43  	N := float64(rowCount)
    44  	d := float64(sampleNDV)
    45  
    46  	ndv = uint64(math.Sqrt(N/n)*f1 + d - f1 + 0.5)
    47  	ndv = mathutil.MaxUint64(ndv, sampleNDV)
    48  	ndv = mathutil.MinUint64(ndv, rowCount)
    49  	return ndv, scaleRatio
    50  }