github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/builder.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"github.com/whtcorpsinc/errors"
    18  	"github.com/whtcorpsinc/milevadb/stochastikctx"
    19  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    20  	"github.com/whtcorpsinc/milevadb/types"
    21  )
    22  
    23  // SortedBuilder is used to build histograms for PK and index.
    24  type SortedBuilder struct {
    25  	sc              *stmtctx.StatementContext
    26  	numBuckets      int64
    27  	valuesPerBucket int64
    28  	lastNumber      int64
    29  	bucketIdx       int64
    30  	Count           int64
    31  	hist            *Histogram
    32  }
    33  
    34  // NewSortedBuilder creates a new SortedBuilder.
    35  func NewSortedBuilder(sc *stmtctx.StatementContext, numBuckets, id int64, tp *types.FieldType) *SortedBuilder {
    36  	return &SortedBuilder{
    37  		sc:              sc,
    38  		numBuckets:      numBuckets,
    39  		valuesPerBucket: 1,
    40  		hist:            NewHistogram(id, 0, 0, 0, tp, int(numBuckets), 0),
    41  	}
    42  }
    43  
    44  // Hist returns the histogram built by SortedBuilder.
    45  func (b *SortedBuilder) Hist() *Histogram {
    46  	return b.hist
    47  }
    48  
    49  // Iterate uFIDelates the histogram incrementally.
    50  func (b *SortedBuilder) Iterate(data types.Causet) error {
    51  	b.Count++
    52  	if b.Count == 1 {
    53  		b.hist.AppendBucket(&data, &data, 1, 1)
    54  		b.hist.NDV = 1
    55  		return nil
    56  	}
    57  	cmp, err := b.hist.GetUpper(int(b.bucketIdx)).CompareCauset(b.sc, &data)
    58  	if err != nil {
    59  		return errors.Trace(err)
    60  	}
    61  	if cmp == 0 {
    62  		// The new item has the same value as current bucket value, to ensure that
    63  		// a same value only stored in a single bucket, we do not increase bucketIdx even if it exceeds
    64  		// valuesPerBucket.
    65  		b.hist.Buckets[b.bucketIdx].Count++
    66  		b.hist.Buckets[b.bucketIdx].Repeat++
    67  	} else if b.hist.Buckets[b.bucketIdx].Count+1-b.lastNumber <= b.valuesPerBucket {
    68  		// The bucket still have room to causetstore a new item, uFIDelate the bucket.
    69  		b.hist.uFIDelateLastBucket(&data, b.hist.Buckets[b.bucketIdx].Count+1, 1)
    70  		b.hist.NDV++
    71  	} else {
    72  		// All buckets are full, we should merge buckets.
    73  		if b.bucketIdx+1 == b.numBuckets {
    74  			b.hist.mergeBuckets(int(b.bucketIdx))
    75  			b.valuesPerBucket *= 2
    76  			b.bucketIdx = b.bucketIdx / 2
    77  			if b.bucketIdx == 0 {
    78  				b.lastNumber = 0
    79  			} else {
    80  				b.lastNumber = b.hist.Buckets[b.bucketIdx-1].Count
    81  			}
    82  		}
    83  		// We may merge buckets, so we should check it again.
    84  		if b.hist.Buckets[b.bucketIdx].Count+1-b.lastNumber <= b.valuesPerBucket {
    85  			b.hist.uFIDelateLastBucket(&data, b.hist.Buckets[b.bucketIdx].Count+1, 1)
    86  		} else {
    87  			b.lastNumber = b.hist.Buckets[b.bucketIdx].Count
    88  			b.bucketIdx++
    89  			b.hist.AppendBucket(&data, &data, b.lastNumber+1, 1)
    90  		}
    91  		b.hist.NDV++
    92  	}
    93  	return nil
    94  }
    95  
    96  // BuildDeferredCausetHist build a histogram for a column.
    97  // numBuckets: number of buckets for the histogram.
    98  // id: the id of the causet.
    99  // collector: the collector of samples.
   100  // tp: the FieldType for the column.
   101  // count: represents the event count for the column.
   102  // ndv: represents the number of distinct values for the column.
   103  // nullCount: represents the number of null values for the column.
   104  func BuildDeferredCausetHist(ctx stochastikctx.Context, numBuckets, id int64, collector *SampleDefCauslector, tp *types.FieldType, count int64, ndv int64, nullCount int64) (*Histogram, error) {
   105  	if ndv > count {
   106  		ndv = count
   107  	}
   108  	if count == 0 || len(collector.Samples) == 0 {
   109  		return NewHistogram(id, ndv, nullCount, 0, tp, 0, collector.TotalSize), nil
   110  	}
   111  	sc := ctx.GetStochastikVars().StmtCtx
   112  	samples := collector.Samples
   113  	samples, err := SortSampleItems(sc, samples)
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  	hg := NewHistogram(id, ndv, nullCount, 0, tp, int(numBuckets), collector.TotalSize)
   118  
   119  	sampleNum := int64(len(samples))
   120  	// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
   121  	sampleFactor := float64(count) / float64(len(samples))
   122  	// Since bucket count is increased by sampleFactor, so the actual max values per bucket is
   123  	// floor(valuesPerBucket/sampleFactor)*sampleFactor, which may less than valuesPerBucket,
   124  	// thus we need to add a sampleFactor to avoid building too many buckets.
   125  	valuesPerBucket := float64(count)/float64(numBuckets) + sampleFactor
   126  	ndvFactor := float64(count) / float64(hg.NDV)
   127  	if ndvFactor > sampleFactor {
   128  		ndvFactor = sampleFactor
   129  	}
   130  	bucketIdx := 0
   131  	var lastCount int64
   132  	var corrXYSum float64
   133  	hg.AppendBucket(&samples[0].Value, &samples[0].Value, int64(sampleFactor), int64(ndvFactor))
   134  	for i := int64(1); i < sampleNum; i++ {
   135  		corrXYSum += float64(i) * float64(samples[i].Ordinal)
   136  		cmp, err := hg.GetUpper(bucketIdx).CompareCauset(sc, &samples[i].Value)
   137  		if err != nil {
   138  			return nil, errors.Trace(err)
   139  		}
   140  		totalCount := float64(i+1) * sampleFactor
   141  		if cmp == 0 {
   142  			// The new item has the same value as current bucket value, to ensure that
   143  			// a same value only stored in a single bucket, we do not increase bucketIdx even if it exceeds
   144  			// valuesPerBucket.
   145  			hg.Buckets[bucketIdx].Count = int64(totalCount)
   146  			if float64(hg.Buckets[bucketIdx].Repeat) == ndvFactor {
   147  				hg.Buckets[bucketIdx].Repeat = int64(2 * sampleFactor)
   148  			} else {
   149  				hg.Buckets[bucketIdx].Repeat += int64(sampleFactor)
   150  			}
   151  		} else if totalCount-float64(lastCount) <= valuesPerBucket {
   152  			// The bucket still have room to causetstore a new item, uFIDelate the bucket.
   153  			hg.uFIDelateLastBucket(&samples[i].Value, int64(totalCount), int64(ndvFactor))
   154  		} else {
   155  			lastCount = hg.Buckets[bucketIdx].Count
   156  			// The bucket is full, causetstore the item in the next bucket.
   157  			bucketIdx++
   158  			hg.AppendBucket(&samples[i].Value, &samples[i].Value, int64(totalCount), int64(ndvFactor))
   159  		}
   160  	}
   161  	// Compute column order correlation with handle.
   162  	if sampleNum == 1 {
   163  		hg.Correlation = 1
   164  		return hg, nil
   165  	}
   166  	// X means the ordinal of the item in original sequence, Y means the oridnal of the item in the
   167  	// sorted sequence, we know that X and Y value sets are both:
   168  	// 0, 1, ..., sampleNum-1
   169  	// we can simply compute sum(X) = sum(Y) =
   170  	//    (sampleNum-1)*sampleNum / 2
   171  	// and sum(X^2) = sum(Y^2) =
   172  	//    (sampleNum-1)*sampleNum*(2*sampleNum-1) / 6
   173  	// We use "Pearson correlation coefficient" to compute the order correlation of columns,
   174  	// the formula is based on https://en.wikipedia.org/wiki/Pearson_correlation_coefficient.
   175  	// Note that (itemsCount*corrX2Sum - corrXSum*corrXSum) would never be zero when sampleNum is larger than 1.
   176  	itemsCount := float64(sampleNum)
   177  	corrXSum := (itemsCount - 1) * itemsCount / 2.0
   178  	corrX2Sum := (itemsCount - 1) * itemsCount * (2*itemsCount - 1) / 6.0
   179  	hg.Correlation = (itemsCount*corrXYSum - corrXSum*corrXSum) / (itemsCount*corrX2Sum - corrXSum*corrXSum)
   180  	return hg, nil
   181  }
   182  
   183  // BuildDeferredCauset builds histogram from samples for column.
   184  func BuildDeferredCauset(ctx stochastikctx.Context, numBuckets, id int64, collector *SampleDefCauslector, tp *types.FieldType) (*Histogram, error) {
   185  	return BuildDeferredCausetHist(ctx, numBuckets, id, collector, tp, collector.Count, collector.FMSketch.NDV(), collector.NullCount)
   186  }