github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/builder.go

// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/milevadb/stochastikctx"
	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
	"github.com/whtcorpsinc/milevadb/types"
)

// SortedBuilder is used to build histograms for PK and index.
type SortedBuilder struct {
	sc              *stmtctx.StatementContext
	numBuckets      int64
	valuesPerBucket int64
	lastNumber      int64
	bucketIdx       int64
	Count           int64
	hist            *Histogram
}

// NewSortedBuilder creates a new SortedBuilder.
func NewSortedBuilder(sc *stmtctx.StatementContext, numBuckets, id int64, tp *types.FieldType) *SortedBuilder {
	return &SortedBuilder{
		sc:              sc,
		numBuckets:      numBuckets,
		valuesPerBucket: 1,
		hist:            NewHistogram(id, 0, 0, 0, tp, int(numBuckets), 0),
	}
}

// Hist returns the histogram built by SortedBuilder.
func (b *SortedBuilder) Hist() *Histogram {
	return b.hist
}

// Iterate updates the histogram incrementally with the next value of a sorted sequence.
func (b *SortedBuilder) Iterate(data types.Causet) error {
	b.Count++
	if b.Count == 1 {
		b.hist.AppendBucket(&data, &data, 1, 1)
		b.hist.NDV = 1
		return nil
	}
	cmp, err := b.hist.GetUpper(int(b.bucketIdx)).CompareCauset(b.sc, &data)
	if err != nil {
		return errors.Trace(err)
	}
	if cmp == 0 {
		// The new item equals the current bucket's upper bound. To ensure that a value is
		// stored in only one bucket, we do not increase bucketIdx even if the bucket
		// exceeds valuesPerBucket.
		b.hist.Buckets[b.bucketIdx].Count++
		b.hist.Buckets[b.bucketIdx].Repeat++
	} else if b.hist.Buckets[b.bucketIdx].Count+1-b.lastNumber <= b.valuesPerBucket {
		// The bucket still has room to store a new item, so update the last bucket.
		b.hist.uFIDelateLastBucket(&data, b.hist.Buckets[b.bucketIdx].Count+1, 1)
		b.hist.NDV++
	} else {
		// All buckets are full, so we merge buckets.
		if b.bucketIdx+1 == b.numBuckets {
			b.hist.mergeBuckets(int(b.bucketIdx))
			b.valuesPerBucket *= 2
			b.bucketIdx = b.bucketIdx / 2
			if b.bucketIdx == 0 {
				b.lastNumber = 0
			} else {
				b.lastNumber = b.hist.Buckets[b.bucketIdx-1].Count
			}
		}
		// We may have merged buckets, so check again whether the last bucket has room.
		if b.hist.Buckets[b.bucketIdx].Count+1-b.lastNumber <= b.valuesPerBucket {
			b.hist.uFIDelateLastBucket(&data, b.hist.Buckets[b.bucketIdx].Count+1, 1)
		} else {
			b.lastNumber = b.hist.Buckets[b.bucketIdx].Count
			b.bucketIdx++
			b.hist.AppendBucket(&data, &data, b.lastNumber+1, 1)
		}
		b.hist.NDV++
	}
	return nil
}
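
// buildSortedHistogram is an illustrative sketch, not part of the original file: it shows
// how a SortedBuilder is typically driven, feeding an already-sorted slice of causets
// through Iterate and returning the finished histogram. Only identifiers defined in this
// file (and the existing imports) are used.
func buildSortedHistogram(sc *stmtctx.StatementContext, numBuckets, id int64, tp *types.FieldType, sorted []types.Causet) (*Histogram, error) {
	b := NewSortedBuilder(sc, numBuckets, id, tp)
	for _, d := range sorted {
		// Values must arrive in sorted order; Iterate only ever compares the new value
		// against the upper bound of the current (last) bucket.
		if err := b.Iterate(d); err != nil {
			return nil, errors.Trace(err)
		}
	}
	return b.Hist(), nil
}
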
// BuildDeferredCausetHist builds a histogram for a column.
// numBuckets: the number of buckets for the histogram.
// id: the id of the causet.
// collector: the collector of samples.
// tp: the FieldType for the column.
// count: the total row count for the column.
// ndv: the number of distinct values for the column.
// nullCount: the number of null values for the column.
func BuildDeferredCausetHist(ctx stochastikctx.Context, numBuckets, id int64, collector *SampleDefCauslector, tp *types.FieldType, count int64, ndv int64, nullCount int64) (*Histogram, error) {
	if ndv > count {
		ndv = count
	}
	if count == 0 || len(collector.Samples) == 0 {
		return NewHistogram(id, ndv, nullCount, 0, tp, 0, collector.TotalSize), nil
	}
	sc := ctx.GetStochastikVars().StmtCtx
	samples := collector.Samples
	samples, err := SortSampleItems(sc, samples)
	if err != nil {
		return nil, err
	}
	hg := NewHistogram(id, ndv, nullCount, 0, tp, int(numBuckets), collector.TotalSize)

	sampleNum := int64(len(samples))
	// Because the histogram is built from samples, bucket counts and repeats must be
	// scaled up by a factor.
	sampleFactor := float64(count) / float64(len(samples))
	// Since the bucket count grows in steps of sampleFactor, the actual maximum number of
	// values per bucket is floor(valuesPerBucket/sampleFactor)*sampleFactor, which may be
	// less than valuesPerBucket; we therefore add one sampleFactor to avoid building too
	// many buckets.
	valuesPerBucket := float64(count)/float64(numBuckets) + sampleFactor
	ndvFactor := float64(count) / float64(hg.NDV)
	if ndvFactor > sampleFactor {
		ndvFactor = sampleFactor
	}
	bucketIdx := 0
	var lastCount int64
	var corrXYSum float64
	hg.AppendBucket(&samples[0].Value, &samples[0].Value, int64(sampleFactor), int64(ndvFactor))
	for i := int64(1); i < sampleNum; i++ {
		corrXYSum += float64(i) * float64(samples[i].Ordinal)
		cmp, err := hg.GetUpper(bucketIdx).CompareCauset(sc, &samples[i].Value)
		if err != nil {
			return nil, errors.Trace(err)
		}
		totalCount := float64(i+1) * sampleFactor
		if cmp == 0 {
			// The new item equals the current bucket's upper bound. To ensure that a value
			// is stored in only one bucket, we do not increase bucketIdx even if the bucket
			// exceeds valuesPerBucket.
			hg.Buckets[bucketIdx].Count = int64(totalCount)
			if float64(hg.Buckets[bucketIdx].Repeat) == ndvFactor {
				hg.Buckets[bucketIdx].Repeat = int64(2 * sampleFactor)
			} else {
				hg.Buckets[bucketIdx].Repeat += int64(sampleFactor)
			}
		} else if totalCount-float64(lastCount) <= valuesPerBucket {
			// The bucket still has room to store a new item, so update the last bucket.
			hg.uFIDelateLastBucket(&samples[i].Value, int64(totalCount), int64(ndvFactor))
		} else {
			lastCount = hg.Buckets[bucketIdx].Count
			// The bucket is full; store the item in the next bucket.
			bucketIdx++
			hg.AppendBucket(&samples[i].Value, &samples[i].Value, int64(totalCount), int64(ndvFactor))
		}
	}
	// Compute the column's order correlation with the handle.
	if sampleNum == 1 {
		hg.Correlation = 1
		return hg, nil
	}
	// X is the ordinal of an item in the original sequence and Y is its ordinal in the
	// sorted sequence. Both X and Y take every value in 0, 1, ..., sampleNum-1 exactly
	// once, so
	//   sum(X) = sum(Y) = (sampleNum-1)*sampleNum / 2
	//   sum(X^2) = sum(Y^2) = (sampleNum-1)*sampleNum*(2*sampleNum-1) / 6
	// We use the Pearson correlation coefficient to measure the order correlation of the
	// column; the formula is based on
	// https://en.wikipedia.org/wiki/Pearson_correlation_coefficient. Because sum(X) equals
	// sum(Y) and sum(X^2) equals sum(Y^2), the two factors under the square root in the
	// denominator are identical, so the denominator collapses to
	// itemsCount*corrX2Sum - corrXSum*corrXSum.
	// Note that (itemsCount*corrX2Sum - corrXSum*corrXSum) is never zero when sampleNum is
	// larger than 1.
	itemsCount := float64(sampleNum)
	corrXSum := (itemsCount - 1) * itemsCount / 2.0
	corrX2Sum := (itemsCount - 1) * itemsCount * (2*itemsCount - 1) / 6.0
	hg.Correlation = (itemsCount*corrXYSum - corrXSum*corrXSum) / (itemsCount*corrX2Sum - corrXSum*corrXSum)
	return hg, nil
}
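
// Worked check of the correlation computation above (illustrative, not part of the
// original file). With sampleNum = 3 and the sample ordinals already in sorted order
// (0, 1, 2), the loop accumulates corrXYSum = 1*1 + 2*2 = 5, while corrXSum = 3 and
// corrX2Sum = 5, giving Correlation = (3*5 - 3*3) / (3*5 - 3*3) = 1. With the ordinals
// reversed (2, 1, 0), corrXYSum = 1*1 + 2*0 = 1 and Correlation = (3*1 - 9) / (15 - 9) = -1,
// as expected for a perfectly descending column.
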
// BuildDeferredCauset builds a histogram from samples for a column, taking the row count,
// NDV and null count from the sample collector.
func BuildDeferredCauset(ctx stochastikctx.Context, numBuckets, id int64, collector *SampleDefCauslector, tp *types.FieldType) (*Histogram, error) {
	return BuildDeferredCausetHist(ctx, numBuckets, id, collector, tp, collector.Count, collector.FMSketch.NDV(), collector.NullCount)
}
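
// buildDeferredCausetSafe is an illustrative sketch, not part of the original file: a
// defensive variant of BuildDeferredCauset that checks the collector before use, since
// BuildDeferredCauset dereferences collector.FMSketch to obtain the NDV estimate. It
// assumes FMSketch is a nilable pointer field on SampleDefCauslector.
func buildDeferredCausetSafe(ctx stochastikctx.Context, numBuckets, id int64, collector *SampleDefCauslector, tp *types.FieldType) (*Histogram, error) {
	if collector == nil || collector.FMSketch == nil {
		return nil, errors.New("statistics: sample collector has no FMSketch, cannot estimate NDV")
	}
	return BuildDeferredCauset(ctx, numBuckets, id, collector, tp)
}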