github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/estimate.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statistics 15 16 import ( 17 "math" 18 19 "github.com/cznic/mathutil" 20 ) 21 22 // calculateEstimateNDV calculates the estimate ndv of a sampled data from a multisize with size total. 23 func calculateEstimateNDV(h *topNHelper, rowCount uint64) (ndv uint64, scaleRatio uint64) { 24 sampleSize, sampleNDV, onlyOnceItems := h.sampleSize, uint64(len(h.sorted)), h.onlyOnceItems 25 scaleRatio = rowCount / sampleSize 26 27 if onlyOnceItems == sampleSize { 28 // Assume this is a unique column, so do not scale up the count of elements 29 return rowCount, 1 30 } else if onlyOnceItems == 0 { 31 // Assume data only consists of sampled data 32 // Nothing to do, no change with scale ratio 33 return sampleNDV, scaleRatio 34 } 35 // Charikar, Moses, et al. "Towards estimation error guarantees for distinct values." 36 // Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2000. 37 // This is GEE in that paper. 38 // estimateNDV = sqrt(N/n) f_1 + sum_2..inf f_i 39 // f_i = number of elements occurred i times in sample 40 41 f1 := float64(onlyOnceItems) 42 n := float64(sampleSize) 43 N := float64(rowCount) 44 d := float64(sampleNDV) 45 46 ndv = uint64(math.Sqrt(N/n)*f1 + d - f1 + 0.5) 47 ndv = mathutil.MaxUint64(ndv, sampleNDV) 48 ndv = mathutil.MinUint64(ndv, rowCount) 49 return ndv, scaleRatio 50 }