github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/feedback.go

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"bytes"
    18  	"encoding/gob"
    19  	"math"
    20  	"math/rand"
    21  	"sort"
    22  	"time"
    23  
    24  	"github.com/cznic/mathutil"
    25  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    26  	"github.com/whtcorpsinc/errors"
    27  	"github.com/whtcorpsinc/log"
    28  	"github.com/whtcorpsinc/milevadb/blockcodec"
    29  	"github.com/whtcorpsinc/milevadb/ekv"
    30  	"github.com/whtcorpsinc/milevadb/metrics"
    31  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    32  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    33  	"github.com/whtcorpsinc/milevadb/soliton/logutil"
    34  	"github.com/whtcorpsinc/milevadb/soliton/ranger"
    35  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    36  	"github.com/whtcorpsinc/milevadb/types"
    37  	"go.uber.org/atomic"
    38  	"go.uber.org/zap"
    39  )
    40  
    41  // Feedback represents the total scan count in range [lower, upper).
    42  type Feedback struct {
    43  	Lower  *types.Causet
    44  	Upper  *types.Causet
    45  	Count  int64
    46  	Repeat int64
    47  }
    48  
    49  // QueryFeedback is used to represent the query feedback info. It contains the query's scan ranges and number of rows
    50  // in each range.
    51  type QueryFeedback struct {
    52  	PhysicalID int64
    53  	Hist       *Histogram
    54  	Tp         int
    55  	Feedback   []Feedback
    56  	Expected   int64 // Expected is the expected scan count of the corresponding query.
    57  	actual     int64 // actual is the actual scan count of the corresponding query.
    58  	Valid      bool  // Valid indicates whether this query feedback is still valid.
    59  	desc       bool  // desc indicates whether the corresponding query is a descending scan.
    60  }
    61  
    62  // NewQueryFeedback returns a new query feedback.
    63  func NewQueryFeedback(physicalID int64, hist *Histogram, expected int64, desc bool) *QueryFeedback {
    64  	if hist != nil && hist.Len() == 0 {
    65  		hist = nil
    66  	}
    67  	tp := PkType
    68  	if hist != nil && hist.IsIndexHist() {
    69  		tp = IndexType
    70  	}
    71  	return &QueryFeedback{
    72  		PhysicalID: physicalID,
    73  		Valid:      true,
    74  		Tp:         tp,
    75  		Hist:       hist,
    76  		Expected:   expected,
    77  		desc:       desc,
    78  	}
    79  }
    80  
    81  // QueryFeedbackKey is the key for a group of feedbacks on the same index/column.
    82  type QueryFeedbackKey struct {
    83  	PhysicalID int64
    84  	HistID     int64
    85  	Tp         int
    86  }
    87  
    88  // QueryFeedbackMap is the collection of feedbacks.
    89  type QueryFeedbackMap struct {
    90  	Size      int
    91  	Feedbacks map[QueryFeedbackKey][]*QueryFeedback
    92  }
    93  
    94  // NewQueryFeedbackMap builds a feedback collection.
    95  func NewQueryFeedbackMap() *QueryFeedbackMap {
    96  	return &QueryFeedbackMap{Feedbacks: make(map[QueryFeedbackKey][]*QueryFeedback)}
    97  }
    98  
    99  // Append adds a feedback into the map.
   100  func (m *QueryFeedbackMap) Append(q *QueryFeedback) {
   101  	k := QueryFeedbackKey{
   102  		PhysicalID: q.PhysicalID,
   103  		HistID:     q.Hist.ID,
   104  		Tp:         q.Tp,
   105  	}
   106  	m.append(k, []*QueryFeedback{q})
   108  }
   109  
   110  // MaxQueryFeedbackCount is the max number of feedbacks that are cached in memory.
   111  var MaxQueryFeedbackCount = atomic.NewInt64(1 << 9)
   112  
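        // append adds qs under the key k while respecting MaxQueryFeedbackCount: once m.Size has
        // reached the limit it drops the feedbacks and returns false, otherwise it appends at most
        // the remaining quota of feedbacks from qs.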
   113  func (m *QueryFeedbackMap) append(k QueryFeedbackKey, qs []*QueryFeedback) bool {
   114  	remained := MaxQueryFeedbackCount.Load() - int64(m.Size)
   115  	if remained <= 0 {
   116  		return false
   117  	}
   118  	s, ok := m.Feedbacks[k]
   119  	if !ok || s == nil {
   120  		s = make([]*QueryFeedback, 0, 8)
   121  	}
   122  	l := mathutil.MinInt64(int64(len(qs)), remained)
   123  	s = append(s, qs[:l]...)
   124  	m.Feedbacks[k] = s
   125  	m.Size = m.Size + int(l)
   126  	return true
   127  }
   128  
   129  // Merge combines 2 collections of feedbacks.
   130  func (m *QueryFeedbackMap) Merge(r *QueryFeedbackMap) {
   131  	for k, qs := range r.Feedbacks {
   132  		if !m.append(k, qs) {
   133  			break
   134  		}
   135  	}
   137  }
   138  
   139  var (
   140  	// MaxNumberOfRanges is the max number of scan ranges for which feedback is still collected; queries with more ranges are skipped.
   141  	MaxNumberOfRanges = 20
   142  	// FeedbackProbability is the probability of collecting the feedback.
   143  	FeedbackProbability = atomic.NewFloat64(0)
   144  )
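
        // For example (illustrative), to collect feedback for roughly 5% of the eligible select
        // queries one would set:
        //
        //	FeedbackProbability.Store(0.05)
        //
        // DefCauslectFeedback below compares rand.Float64() against this value.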
   145  
   146  // CalcErrorRate calculates the error rate of the current QueryFeedback.
   147  func (q *QueryFeedback) CalcErrorRate() float64 {
   148  	expected := float64(q.Expected)
   149  	if q.actual == 0 {
   150  		if expected == 0 {
   151  			return 0
   152  		}
   153  		return 1
   154  	}
   155  	return math.Abs(expected-float64(q.actual)) / float64(q.actual)
   156  }
   157  
   158  // DefCauslectFeedback decides whether to collect the feedback. It returns false when:
   159  // 1: the feedback is not generated by a select query;
   160  // 2: the histogram is nil or has no buckets;
   161  // 3: the number of scan ranges exceeds the limit because it may affect the performance;
   162  // 4: it does not pass the probabilistic sampler.
   163  func DefCauslectFeedback(sc *stmtctx.StatementContext, q *QueryFeedback, numOfRanges int) bool {
   164  	if !sc.InSelectStmt {
   165  		return false
   166  	}
   167  	if q.Hist == nil || q.Hist.Len() == 0 {
   168  		return false
   169  	}
   170  	if numOfRanges > MaxNumberOfRanges || rand.Float64() > FeedbackProbability.Load() {
   171  		return false
   172  	}
   173  	return true
   174  }
   175  
   176  // DecodeToRanges decodes the feedback to ranges.
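        // Index feedback bounds are stored as encoded key bytes and are decoded with codec.DecodeRange,
        // while primary key bounds are decoded as integers. Every returned range excludes its upper
        // bound (HighExclude is true), matching the [lower, upper) convention of Feedback.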
   177  func (q *QueryFeedback) DecodeToRanges(isIndex bool) ([]*ranger.Range, error) {
   178  	ranges := make([]*ranger.Range, 0, len(q.Feedback))
   179  	for _, val := range q.Feedback {
   180  		low, high := *val.Lower, *val.Upper
   181  		var lowVal, highVal []types.Causet
   182  		if isIndex {
   183  			var err error
   184  			// As we do not know the original length, just use a custom value here.
   185  			lowVal, _, err = codec.DecodeRange(low.GetBytes(), 4, nil, nil)
   186  			if err != nil {
   187  				return nil, errors.Trace(err)
   188  			}
   189  			highVal, _, err = codec.DecodeRange(high.GetBytes(), 4, nil, nil)
   190  			if err != nil {
   191  				return nil, errors.Trace(err)
   192  			}
   193  		} else {
   194  			_, lowInt, err := codec.DecodeInt(val.Lower.GetBytes())
   195  			if err != nil {
   196  				return nil, errors.Trace(err)
   197  			}
   198  			_, highInt, err := codec.DecodeInt(val.Upper.GetBytes())
   199  			if err != nil {
   200  				return nil, errors.Trace(err)
   201  			}
   202  			lowVal = []types.Causet{types.NewIntCauset(lowInt)}
   203  			highVal = []types.Causet{types.NewIntCauset(highInt)}
   204  		}
   205  		ranges = append(ranges, &(ranger.Range{
   206  			LowVal:      lowVal,
   207  			HighVal:     highVal,
   208  			HighExclude: true,
   209  		}))
   210  	}
   211  	return ranges, nil
   212  }
   213  
   214  // DecodeIntValues is called when the current Feedback stores encoded int values.
   215  func (q *QueryFeedback) DecodeIntValues() *QueryFeedback {
   216  	nq := &QueryFeedback{}
   217  	nq.Feedback = make([]Feedback, 0, len(q.Feedback))
   218  	for _, fb := range q.Feedback {
   219  		_, lowInt, err := codec.DecodeInt(fb.Lower.GetBytes())
   220  		if err != nil {
   221  			logutil.BgLogger().Debug("decode feedback lower bound value to integer failed", zap.Binary("value", fb.Lower.GetBytes()), zap.Error(err))
   222  			continue
   223  		}
   224  		_, highInt, err := codec.DecodeInt(fb.Upper.GetBytes())
   225  		if err != nil {
   226  			logutil.BgLogger().Debug("decode feedback upper bound value to integer failed", zap.Binary("value", fb.Upper.GetBytes()), zap.Error(err))
   227  			continue
   228  		}
   229  		low, high := types.NewIntCauset(lowInt), types.NewIntCauset(highInt)
   230  		nq.Feedback = append(nq.Feedback, Feedback{Lower: &low, Upper: &high, Count: fb.Count})
   231  	}
   232  	return nq
   233  }
   234  
   235  // StoreRanges stores the ranges for uFIDelate.
   236  func (q *QueryFeedback) StoreRanges(ranges []*ranger.Range) {
   237  	q.Feedback = make([]Feedback, 0, len(ranges))
   238  	for _, ran := range ranges {
   239  		q.Feedback = append(q.Feedback, Feedback{&ran.LowVal[0], &ran.HighVal[0], 0, 0})
   240  	}
   241  }
   242  
   243  // Invalidate is used to invalidate the query feedback.
   244  func (q *QueryFeedback) Invalidate() {
   245  	q.Feedback = nil
   246  	q.Hist = nil
   247  	q.Valid = false
   248  	q.actual = -1
   249  }
   250  
   251  // Actual gets the actual row count.
   252  func (q *QueryFeedback) Actual() int64 {
   253  	if !q.Valid {
   254  		return -1
   255  	}
   256  	return q.actual
   257  }
   258  
   259  // UFIDelate uFIDelates the query feedback. `startKey` is the start scan key of the partial result, used to find
   260  // the range to uFIDelate. `counts` is the scan count of each range, used to uFIDelate the feedback count info.
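        // It works in four steps: sum the partial counts into q.actual, strip the index/row key prefix
        // from startKey and locate the feedback range it falls in via binary search on the lower bounds,
        // reverse counts for descending scans, and add each count to the matching consecutive feedback
        // ranges. If the counts run past the last range, the whole feedback is invalidated.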
   261  func (q *QueryFeedback) UFIDelate(startKey ekv.Key, counts []int64) {
   262  	// Older versions do not have the counts info.
   263  	if len(counts) == 0 {
   264  		q.Invalidate()
   265  		return
   266  	}
   267  	sum := int64(0)
   268  	for _, count := range counts {
   269  		sum += count
   270  	}
   271  	metrics.DistALLEGROSQLScanKeysPartialHistogram.Observe(float64(sum))
   272  	q.actual += sum
   273  	if !q.Valid || q.Hist == nil {
   274  		return
   275  	}
   276  
   277  	if q.Tp == IndexType {
   278  		startKey = blockcodec.CutIndexPrefix(startKey)
   279  	} else {
   280  		startKey = blockcodec.CutRowKeyPrefix(startKey)
   281  	}
   282  	// Find the range that startKey falls in.
   283  	idx := sort.Search(len(q.Feedback), func(i int) bool {
   284  		return bytes.Compare(q.Feedback[i].Lower.GetBytes(), startKey) > 0
   285  	})
   286  	idx--
   287  	if idx < 0 {
   288  		return
   289  	}
   290  	// If desc is true, the counts are reversed, so here we need to reverse them back.
   291  	if q.desc {
   292  		for i := 0; i < len(counts)/2; i++ {
   293  			j := len(counts) - i - 1
   294  			counts[i], counts[j] = counts[j], counts[i]
   295  		}
   296  	}
   297  	// UFIDelate the feedback count info.
   298  	for i, count := range counts {
   299  		if i+idx >= len(q.Feedback) {
   300  			q.Invalidate()
   301  			break
   302  		}
   303  		q.Feedback[i+idx].Count += count
   304  	}
   305  }
   306  
   307  // NonOverlappedFeedbacks extracts a set of feedbacks which are not overlapped with each other.
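        // For example (illustrative), given feedbacks on [1, 4), [2, 3) and [5, 8), they are sorted to
        // [2, 3), [1, 4), [5, 8) and the greedy pass keeps [2, 3) and [5, 8), dropping [1, 4) because
        // it starts before the previously chosen end point.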
   308  func NonOverlappedFeedbacks(sc *stmtctx.StatementContext, fbs []Feedback) ([]Feedback, bool) {
   309  	// Sort feedbacks by end point and start point incrementally, then pick every feedback that is not overlapped
   310  	// with the previous chosen feedbacks.
   311  	var existsErr bool
   312  	sort.Slice(fbs, func(i, j int) bool {
   313  		res, err := fbs[i].Upper.CompareCauset(sc, fbs[j].Upper)
   314  		if err != nil {
   315  			existsErr = true
   316  		}
   317  		if existsErr || res != 0 {
   318  			return res < 0
   319  		}
   320  		res, err = fbs[i].Lower.CompareCauset(sc, fbs[j].Lower)
   321  		if err != nil {
   322  			existsErr = true
   323  		}
   324  		return res < 0
   325  	})
   326  	if existsErr {
   327  		return fbs, false
   328  	}
   329  	resFBs := make([]Feedback, 0, len(fbs))
   330  	previousEnd := &types.Causet{}
   331  	for _, fb := range fbs {
   332  		res, err := previousEnd.CompareCauset(sc, fb.Lower)
   333  		if err != nil {
   334  			return fbs, false
   335  		}
   336  		if res <= 0 {
   337  			resFBs = append(resFBs, fb)
   338  			previousEnd = fb.Upper
   339  		}
   340  	}
   341  	return resFBs, true
   342  }
   343  
   344  // BucketFeedback stands for all the feedback for a bucket.
   345  type BucketFeedback struct {
   346  	feedback []Feedback    // All the feedback info in the same bucket.
   347  	lower    *types.Causet // The lower bound of the new bucket.
   348  	upper    *types.Causet // The upper bound of the new bucket.
   349  }
   350  
   351  // outOfRange checks whether `val` lies in the range [min, max]. It returns a negative value if val is less than min, a positive value if val is greater than max, and 0 otherwise.
   352  func outOfRange(sc *stmtctx.StatementContext, min, max, val *types.Causet) (int, error) {
   353  	result, err := val.CompareCauset(sc, min)
   354  	if err != nil {
   355  		return 0, err
   356  	}
   357  	if result < 0 {
   358  		return result, nil
   359  	}
   360  	result, err = val.CompareCauset(sc, max)
   361  	if err != nil {
   362  		return 0, err
   363  	}
   364  	if result > 0 {
   365  		return result, nil
   366  	}
   367  	return 0, nil
   368  }
   369  
   370  // adjustFeedbackBoundaries adjusts the feedback boundaries according to `min` and `max`.
   371  // If the feedback has no intersection with [min, max], it returns true so that the caller can skip this feedback.
   372  func (f *Feedback) adjustFeedbackBoundaries(sc *stmtctx.StatementContext, min, max *types.Causet) (bool, error) {
   373  	result, err := outOfRange(sc, min, max, f.Lower)
   374  	if err != nil {
   375  		return false, err
   376  	}
   377  	if result > 0 {
   378  		return true, nil
   379  	}
   380  	if result < 0 {
   381  		f.Lower = min
   382  	}
   383  	result, err = outOfRange(sc, min, max, f.Upper)
   384  	if err != nil {
   385  		return false, err
   386  	}
   387  	if result < 0 {
   388  		return true, nil
   389  	}
   390  	if result > 0 {
   391  		f.Upper = max
   392  	}
   393  	return false, nil
   394  }
   395  
   396  // buildBucketFeedback builds the feedback for each bucket from the histogram feedback.
   397  func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*BucketFeedback, int) {
   398  	bktID2FB := make(map[int]*BucketFeedback)
   399  	if len(feedback.Feedback) == 0 {
   400  		return bktID2FB, 0
   401  	}
   402  	total := 0
   403  	sc := &stmtctx.StatementContext{TimeZone: time.UTC}
   404  	min, max := types.GetMinValue(h.Tp), types.GetMaxValue(h.Tp)
   405  	for _, fb := range feedback.Feedback {
   406  		skip, err := fb.adjustFeedbackBoundaries(sc, &min, &max)
   407  		if err != nil {
   408  			logutil.BgLogger().Debug("adjust feedback boundaries failed", zap.Error(err))
   409  			continue
   410  		}
   411  		if skip {
   412  			continue
   413  		}
   414  		idx := h.Bounds.UpperBound(0, fb.Lower)
   415  		bktIdx := 0
   416  		// The last bucket also stores the feedback that falls outside the upper bound.
   417  		if idx >= h.Bounds.NumRows()-1 {
   418  			bktIdx = h.Len() - 1
   419  		} else if h.Len() == 1 {
   420  			bktIdx = 0
   421  		} else {
   422  			if idx == 0 {
   423  				bktIdx = 0
   424  			} else {
   425  				bktIdx = (idx - 1) / 2
   426  			}
   427  			// Make sure that this feedback lies within the bucket.
   428  			if chunk.Compare(h.Bounds.GetRow(2*(bktIdx+1)), 0, fb.Upper) < 0 {
   429  				continue
   430  			}
   431  		}
   432  		total++
   433  		bkt := bktID2FB[bktIdx]
   434  		if bkt == nil {
   435  			bkt = &BucketFeedback{lower: h.GetLower(bktIdx), upper: h.GetUpper(bktIdx)}
   436  			bktID2FB[bktIdx] = bkt
   437  		}
   438  		bkt.feedback = append(bkt.feedback, fb)
   439  		// UFIDelate the bound if necessary.
   440  		res, err := bkt.lower.CompareCauset(nil, fb.Lower)
   441  		if err != nil {
   442  			logutil.BgLogger().Debug("compare causet failed", zap.Any("value1", bkt.lower), zap.Any("value2", fb.Lower), zap.Error(err))
   443  			continue
   444  		}
   445  		if res > 0 {
   446  			bkt.lower = fb.Lower
   447  		}
   448  		res, err = bkt.upper.CompareCauset(nil, fb.Upper)
   449  		if err != nil {
   450  			logutil.BgLogger().Debug("compare causet failed", zap.Any("value1", bkt.upper), zap.Any("value2", fb.Upper), zap.Error(err))
   451  			continue
   452  		}
   453  		if res < 0 {
   454  			bkt.upper = fb.Upper
   455  		}
   456  	}
   457  	return bktID2FB, total
   458  }
   459  
   460  // getBoundaries gets the new bucket boundaries after the split.
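        // It collects the lower/upper bounds of all feedbacks plus the bucket's own lower bound,
        // sorts them, keeps every (len(vals)/num)-th value, appends the bucket's upper bound as the
        // last boundary and finally erases duplicated values.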
   461  func (b *BucketFeedback) getBoundaries(num int) []types.Causet {
   462  	// Get all the possible new boundaries.
   463  	vals := make([]types.Causet, 0, len(b.feedback)*2+2)
   464  	for _, fb := range b.feedback {
   465  		vals = append(vals, *fb.Lower, *fb.Upper)
   466  	}
   467  	vals = append(vals, *b.lower)
   468  	err := types.SortCausets(nil, vals)
   469  	if err != nil {
   470  		logutil.BgLogger().Debug("sort datums failed", zap.Error(err))
   471  		return []types.Causet{*b.lower, *b.upper}
   472  	}
   473  	total, interval := 0, len(vals)/num
   474  	// Pick one value every `interval` values.
   475  	for i := 0; i < len(vals); i, total = i+interval, total+1 {
   476  		vals[total] = vals[i]
   477  	}
   478  	// Append the upper bound.
   479  	vals[total] = *b.upper
   480  	vals = vals[:total+1]
   481  	total = 1
   482  	// Erase the repeated values.
   483  	for i := 1; i < len(vals); i++ {
   484  		cmp, err := vals[total-1].CompareCauset(nil, &vals[i])
   485  		if err != nil {
   486  			logutil.BgLogger().Debug("compare causet failed", zap.Any("value1", vals[total-1]), zap.Any("value2", vals[i]), zap.Error(err))
   487  			continue
   488  		}
   489  		if cmp == 0 {
   490  			continue
   491  		}
   492  		vals[total] = vals[i]
   493  		total++
   494  	}
   495  	return vals[:total]
   496  }
   497  
   498  // There are only two types of causet in a bucket: one is `Blob`, which is for indexes; the other
   499  // is `Int`, which is for primary keys.
   500  type bucket = Feedback
   501  
   502  // splitBucket first splits this "BucketFeedback" into "newNumBkts" new buckets and
   503  // calculates the count for each new bucket; any new bucket whose count is smaller than
   504  // "minBucketFraction*totalCount" is merged into the next new bucket, up to the last
   505  // new bucket.
   506  func (b *BucketFeedback) splitBucket(newNumBkts int, totalCount float64, originBucketCount float64) []bucket {
   507  	// Split the bucket.
   508  	bounds := b.getBoundaries(newNumBkts + 1)
   509  	bkts := make([]bucket, 0, len(bounds)-1)
   510  	sc := &stmtctx.StatementContext{TimeZone: time.UTC}
   511  	for i := 1; i < len(bounds); i++ {
   512  		newBkt := bucket{&bounds[i-1], bounds[i].Clone(), 0, 0}
   513  		// get bucket count
   514  		_, ratio := getOverlapFraction(Feedback{b.lower, b.upper, int64(originBucketCount), 0}, newBkt)
   515  		countInNewBkt := originBucketCount * ratio
   516  		countInNewBkt = b.refineBucketCount(sc, newBkt, countInNewBkt)
   517  		// do not split if the count of result bucket is too small.
   518  		if countInNewBkt < minBucketFraction*totalCount {
   519  			bounds[i] = bounds[i-1]
   520  			continue
   521  		}
   522  		newBkt.Count = int64(countInNewBkt)
   523  		bkts = append(bkts, newBkt)
   524  		// To guarantee that the ranges of the buckets will not overlap.
   525  		setNextValue(&bounds[i])
   526  	}
   527  	return bkts
   528  }
   529  
   530  // getOverlapFraction gets the overlap fraction of feedback and bucket range. In order to get the bucket count, it also
   531  // returns the ratio between bucket fraction and feedback fraction.
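        // For example (illustrative, assuming calcFraction4Causets maps integer values linearly onto
        // [0, 1]): with feedback [0, 10) and bucket [5, 20) the normalized bounds are 0, 0.5, 0.25 and
        // 1.0, so the overlap is min(1.0-0, 0.5-0.25) = 0.25 and the ratio is (1.0-0.25)/(0.5-0) = 1.5.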
   532  func getOverlapFraction(fb Feedback, bkt bucket) (float64, float64) {
   533  	datums := make([]types.Causet, 0, 4)
   534  	datums = append(datums, *fb.Lower, *fb.Upper)
   535  	datums = append(datums, *bkt.Lower, *bkt.Upper)
   536  	err := types.SortCausets(nil, datums)
   537  	if err != nil {
   538  		return 0, 0
   539  	}
   540  	minValue, maxValue := &datums[0], &datums[3]
   541  	fbLower := calcFraction4Causets(minValue, maxValue, fb.Lower)
   542  	fbUpper := calcFraction4Causets(minValue, maxValue, fb.Upper)
   543  	bktLower := calcFraction4Causets(minValue, maxValue, bkt.Lower)
   544  	bktUpper := calcFraction4Causets(minValue, maxValue, bkt.Upper)
   545  	ratio := (bktUpper - bktLower) / (fbUpper - fbLower)
   546  	// full overlap
   547  	if fbLower <= bktLower && bktUpper <= fbUpper {
   548  		return bktUpper - bktLower, ratio
   549  	}
   550  	if bktLower <= fbLower && fbUpper <= bktUpper {
   551  		return fbUpper - fbLower, ratio
   552  	}
   553  	// partial overlap
   554  	overlap := math.Min(bktUpper-fbLower, fbUpper-bktLower)
   555  	return overlap, ratio
   556  }
   557  
   558  // mergeFullyContainedFeedback merges the max fraction of non-overlapped feedbacks that are fully contained in the bucket.
   559  func (b *BucketFeedback) mergeFullyContainedFeedback(sc *stmtctx.StatementContext, bkt bucket) (float64, float64, bool) {
   560  	feedbacks := make([]Feedback, 0, len(b.feedback))
   561  	// Get all the fully contained feedbacks.
   562  	for _, fb := range b.feedback {
   563  		res, err := outOfRange(sc, bkt.Lower, bkt.Upper, fb.Lower)
   564  		if res != 0 || err != nil {
   565  			return 0, 0, false
   566  		}
   567  		res, err = outOfRange(sc, bkt.Lower, bkt.Upper, fb.Upper)
   568  		if res != 0 || err != nil {
   569  			return 0, 0, false
   570  		}
   571  		feedbacks = append(feedbacks, fb)
   572  	}
   573  	if len(feedbacks) == 0 {
   574  		return 0, 0, false
   575  	}
   576  	sortedFBs, ok := NonOverlappedFeedbacks(sc, feedbacks)
   577  	if !ok {
   578  		return 0, 0, false
   579  	}
   580  	var sumFraction, sumCount float64
   581  	for _, fb := range sortedFBs {
   582  		fraction, _ := getOverlapFraction(fb, bkt)
   583  		sumFraction += fraction
   584  		sumCount += float64(fb.Count)
   585  	}
   586  	return sumFraction, sumCount, true
   587  }
   588  
   589  // refineBucketCount refines the newly split bucket count. It uses the feedback that overlaps most
   590  // with the bucket to get the bucket count.
   591  func (b *BucketFeedback) refineBucketCount(sc *stmtctx.StatementContext, bkt bucket, defaultCount float64) float64 {
   592  	bestFraction := minBucketFraction
   593  	count := defaultCount
   594  	sumFraction, sumCount, ok := b.mergeFullyContainedFeedback(sc, bkt)
   595  	if ok && sumFraction > bestFraction {
   596  		bestFraction = sumFraction
   597  		count = sumCount / sumFraction
   598  	}
   599  	for _, fb := range b.feedback {
   600  		fraction, ratio := getOverlapFraction(fb, bkt)
   601  		// choose the max overlap fraction
   602  		if fraction > bestFraction {
   603  			bestFraction = fraction
   604  			count = float64(fb.Count) * ratio
   605  		}
   606  	}
   607  	return count
   608  }
   609  
   610  const (
   611  	defaultSplitCount = 10
   612  	splitPerFeedback  = 10
   613  )
   614  
   615  // getSplitCount gets the split count for the histogram. It is based on the intuition that:
   616  // 1: If we have more remaining unused buckets, we can split more.
   617  // 2: We cannot split too aggressively, thus we only split once per `splitPerFeedback` feedbacks.
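        // For example, getSplitCount(100, 4) = min(max(4, 10), 100/10) = 10, while
        // getSplitCount(50, 40) = min(max(40, 10), 50/10) = 5.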
   618  func getSplitCount(numFeedbacks, remainBuckets int) int {
   619  	// Split more if have more buckets available.
   620  	splitCount := mathutil.Max(remainBuckets, defaultSplitCount)
   621  	return mathutil.Min(splitCount, numFeedbacks/splitPerFeedback)
   622  }
   623  
   624  type bucketSembedded struct {
   625  	id        int
   626  	sembedded float64
   627  }
   628  
   629  type bucketSembeddeds []bucketSembedded
   630  
   631  func (bs bucketSembeddeds) Len() int           { return len(bs) }
   632  func (bs bucketSembeddeds) Swap(i, j int)      { bs[i], bs[j] = bs[j], bs[i] }
   633  func (bs bucketSembeddeds) Less(i, j int) bool { return bs[i].sembedded < bs[j].sembedded }
   634  
   635  const (
   636  	// To avoid the histogram being too imbalanced, we constrain the count of a bucket to the range
   637  	// [minBucketFraction * totalCount, maxBucketFraction * totalCount].
   638  	minBucketFraction = 1 / 10000.0
   639  	maxBucketFraction = 1 / 10.0
   640  )
   641  
   642  // getBucketSembedded gets the sembedded for merging this bucket with the previous one.
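        // For example (illustrative, assuming a linear calcFraction4Causets): merging bucket [10, 20)
        // with count 30 into the previous bucket [0, 10) with count 10 under totalCount 1000 gives
        // err = 0.5*40 - 10 = 10, so the sembedded is |10/40| = 0.25.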
   643  // TODO: We also need to consider the bucket hit count.
   644  func getBucketSembedded(bkts []bucket, totalCount float64, id int) bucketSembedded {
   645  	preCount, count := float64(bkts[id-1].Count), float64(bkts[id].Count)
   646  	// do not merge if the result bucket is too large
   647  	if (preCount + count) > maxBucketFraction*totalCount {
   648  		return bucketSembedded{id, math.MaxFloat64}
   649  	}
   650  	// Merge them if the result bucket is already too small.
   651  	if (preCount + count) < minBucketFraction*totalCount {
   652  		return bucketSembedded{id, 0}
   653  	}
   654  	low, mid, high := bkts[id-1].Lower, bkts[id-1].Upper, bkts[id].Upper
   655  	// If we choose to merge, err is the absolute estimate error for the previous bucket.
   656  	err := calcFraction4Causets(low, high, mid)*(preCount+count) - preCount
   657  	return bucketSembedded{id, math.Abs(err / (preCount + count))}
   658  }
   659  
   660  // defaultBucketCount is the number of buckets a column histogram has.
   661  var defaultBucketCount = 256
   662  
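        // mergeBuckets reduces the number of buckets to at most defaultBucketCount by merging the
        // adjacent pairs with the smallest sembeddeds into their predecessors; buckets that were just
        // created by the split (marked in isNewBuckets) are never chosen for merging.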
   663  func mergeBuckets(bkts []bucket, isNewBuckets []bool, totalCount float64) []bucket {
   664  	mergeCount := len(bkts) - defaultBucketCount
   665  	if mergeCount <= 0 {
   666  		return bkts
   667  	}
   668  	bs := make(bucketSembeddeds, 0, len(bkts))
   669  	for i := 1; i < len(bkts); i++ {
   670  		// Do not merge the newly created buckets.
   671  		if !isNewBuckets[i] && !isNewBuckets[i-1] {
   672  			bs = append(bs, getBucketSembedded(bkts, totalCount, i))
   673  		}
   674  	}
   675  	sort.Sort(bs)
   676  	ids := make([]int, 0, mergeCount)
   677  	for i := 0; i < mergeCount; i++ {
   678  		ids = append(ids, bs[i].id)
   679  	}
   680  	sort.Ints(ids)
   681  	idCursor, bktCursor := 0, 0
   682  	for i := range bkts {
   683  		// Merge this bucket with last one.
   684  		if idCursor < mergeCount && ids[idCursor] == i {
   685  			bkts[bktCursor-1].Upper = bkts[i].Upper
   686  			bkts[bktCursor-1].Count += bkts[i].Count
   687  			bkts[bktCursor-1].Repeat = bkts[i].Repeat
   688  			idCursor++
   689  		} else {
   690  			bkts[bktCursor] = bkts[i]
   691  			bktCursor++
   692  		}
   693  	}
   694  	bkts = bkts[:bktCursor]
   695  	return bkts
   696  }
   697  
   698  // splitBuckets splits the histogram buckets according to the feedback.
   699  func splitBuckets(h *Histogram, feedback *QueryFeedback) ([]bucket, []bool, int64) {
   700  	bktID2FB, numTotalFBs := buildBucketFeedback(h, feedback)
   701  	buckets := make([]bucket, 0, h.Len())
   702  	isNewBuckets := make([]bool, 0, h.Len())
   703  	splitCount := getSplitCount(numTotalFBs, defaultBucketCount-h.Len())
   704  	for i := 0; i < h.Len(); i++ {
   705  		bktFB, ok := bktID2FB[i]
   706  		// No feedback, just use the original one.
   707  		if !ok {
   708  			buckets = append(buckets, bucket{h.GetLower(i), h.GetUpper(i), h.bucketCount(i), h.Buckets[i].Repeat})
   709  			isNewBuckets = append(isNewBuckets, false)
   710  			continue
   711  		}
   712  		// Distribute the total split count to each bucket based on the number of feedbacks it has.
   713  		newBktNums := splitCount * len(bktFB.feedback) / numTotalFBs
   714  		bkts := bktFB.splitBucket(newBktNums, h.TotalRowCount(), float64(h.bucketCount(i)))
   715  		buckets = append(buckets, bkts...)
   716  		if len(bkts) == 1 {
   717  			isNewBuckets = append(isNewBuckets, false)
   718  		} else {
   719  			for i := 0; i < len(bkts); i++ {
   720  				isNewBuckets = append(isNewBuckets, true)
   721  			}
   722  		}
   723  	}
   724  	totCount := int64(0)
   725  	for _, bkt := range buckets {
   726  		totCount += bkt.Count
   727  	}
   728  	return buckets, isNewBuckets, totCount
   729  }
   730  
   731  // UFIDelateHistogram uFIDelates the histogram according to the feedback buckets.
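        // The flow is: splitBuckets refines the buckets with the feedback, mergeBuckets shrinks the
        // result back to at most defaultBucketCount buckets, and buildNewHistogram assembles the final
        // histogram from them.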
   732  func UFIDelateHistogram(h *Histogram, feedback *QueryFeedback) *Histogram {
   733  	buckets, isNewBuckets, totalCount := splitBuckets(h, feedback)
   734  	buckets = mergeBuckets(buckets, isNewBuckets, float64(totalCount))
   735  	hist := buildNewHistogram(h, buckets)
   736  	// UFIDelate the NDV of primary key column.
   737  	if feedback.Tp == PkType {
   738  		hist.NDV = int64(hist.TotalRowCount())
   739  	}
   740  	return hist
   741  }
   742  
   743  // UFIDelateCMSketch uFIDelates the CMSketch by feedback.
   744  func UFIDelateCMSketch(c *CMSketch, eqFeedbacks []Feedback) *CMSketch {
   745  	if c == nil || len(eqFeedbacks) == 0 {
   746  		return c
   747  	}
   748  	newCMSketch := c.Copy()
   749  	for _, fb := range eqFeedbacks {
   750  		newCMSketch.uFIDelateValueBytes(fb.Lower.GetBytes(), uint64(fb.Count))
   751  	}
   752  	return newCMSketch
   753  }
   754  
   755  func buildNewHistogram(h *Histogram, buckets []bucket) *Histogram {
   756  	hist := NewHistogram(h.ID, h.NDV, h.NullCount, h.LastUFIDelateVersion, h.Tp, len(buckets), h.TotDefCausSize)
   757  	preCount := int64(0)
   758  	for _, bkt := range buckets {
   759  		hist.AppendBucket(bkt.Lower, bkt.Upper, bkt.Count+preCount, bkt.Repeat)
   760  		preCount += bkt.Count
   761  	}
   762  	return hist
   763  }
   764  
   765  // queryFeedback is used to serialize the QueryFeedback.
   766  type queryFeedback struct {
   767  	IntRanges []int64
   768  	// HashValues is the murmur hash values for each index point.
   769  	// Note that index points are now stored in `IndexPoints`; we keep this field only for compatibility.
   770  	HashValues  []uint64
   771  	IndexRanges [][]byte
   772  	// IndexPoints stores the value of each equal condition.
   773  	IndexPoints [][]byte
   774  	// Counts is the number of scan keys in each range. It first stores the counts for `IntRanges`, `IndexRanges` or `DeferredCausetRanges`.
   775  	// After that, it stores the counts for `IndexPoints` (or, in older versions, `HashValues`).
   776  	Counts               []int64
   777  	DeferredCausetRanges [][]byte
   778  }
   779  
   780  func encodePKFeedback(q *QueryFeedback) (*queryFeedback, error) {
   781  	pb := &queryFeedback{}
   782  	for _, fb := range q.Feedback {
   783  		// There is no need to uFIDelate the point queries.
   784  		if bytes.Compare(ekv.Key(fb.Lower.GetBytes()).PrefixNext(), fb.Upper.GetBytes()) >= 0 {
   785  			continue
   786  		}
   787  		_, low, err := codec.DecodeInt(fb.Lower.GetBytes())
   788  		if err != nil {
   789  			return nil, errors.Trace(err)
   790  		}
   791  		_, high, err := codec.DecodeInt(fb.Upper.GetBytes())
   792  		if err != nil {
   793  			return nil, errors.Trace(err)
   794  		}
   795  		pb.IntRanges = append(pb.IntRanges, low, high)
   796  		pb.Counts = append(pb.Counts, fb.Count)
   797  	}
   798  	return pb, nil
   799  }
   800  
   801  func encodeIndexFeedback(q *QueryFeedback) *queryFeedback {
   802  	pb := &queryFeedback{}
   803  	var pointCounts []int64
   804  	for _, fb := range q.Feedback {
   805  		if bytes.Compare(ekv.Key(fb.Lower.GetBytes()).PrefixNext(), fb.Upper.GetBytes()) >= 0 {
   806  			pb.IndexPoints = append(pb.IndexPoints, fb.Lower.GetBytes())
   807  			pointCounts = append(pointCounts, fb.Count)
   808  		} else {
   809  			pb.IndexRanges = append(pb.IndexRanges, fb.Lower.GetBytes(), fb.Upper.GetBytes())
   810  			pb.Counts = append(pb.Counts, fb.Count)
   811  		}
   812  	}
   813  	pb.Counts = append(pb.Counts, pointCounts...)
   814  	return pb
   815  }
   816  
   817  func encodeDeferredCausetFeedback(q *QueryFeedback) (*queryFeedback, error) {
   818  	pb := &queryFeedback{}
   819  	sc := stmtctx.StatementContext{TimeZone: time.UTC}
   820  	for _, fb := range q.Feedback {
   821  		lowerBytes, err := codec.EncodeKey(&sc, nil, *fb.Lower)
   822  		if err != nil {
   823  			return nil, errors.Trace(err)
   824  		}
   825  		upperBytes, err := codec.EncodeKey(&sc, nil, *fb.Upper)
   826  		if err != nil {
   827  			return nil, errors.Trace(err)
   828  		}
   829  		pb.DeferredCausetRanges = append(pb.DeferredCausetRanges, lowerBytes, upperBytes)
   830  		pb.Counts = append(pb.Counts, fb.Count)
   831  	}
   832  	return pb, nil
   833  }
   834  
   835  // EncodeFeedback encodes the given feedback to byte slice.
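        // The queryFeedback struct is serialized with encoding/gob. An illustrative round trip
        // (identifiers are placeholders):
        //
        //	data, err := EncodeFeedback(q)
        //	// ... ship data through the feedback channel ...
        //	err = DecodeFeedback(data, newQ, cms, colType)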
   836  func EncodeFeedback(q *QueryFeedback) ([]byte, error) {
   837  	var pb *queryFeedback
   838  	var err error
   839  	switch q.Tp {
   840  	case PkType:
   841  		pb, err = encodePKFeedback(q)
   842  	case IndexType:
   843  		pb = encodeIndexFeedback(q)
   844  	case DefCausType:
   845  		pb, err = encodeDeferredCausetFeedback(q)
   846  	}
   847  	if err != nil {
   848  		return nil, errors.Trace(err)
   849  	}
   850  	var buf bytes.Buffer
   851  	enc := gob.NewEncoder(&buf)
   852  	err = enc.Encode(pb)
   853  	return buf.Bytes(), errors.Trace(err)
   854  }
   855  
   856  func decodeFeedbackForIndex(q *QueryFeedback, pb *queryFeedback, c *CMSketch) {
   857  	q.Tp = IndexType
   858  	// decode the index range feedback
   859  	for i := 0; i < len(pb.IndexRanges); i += 2 {
   860  		lower, upper := types.NewBytesCauset(pb.IndexRanges[i]), types.NewBytesCauset(pb.IndexRanges[i+1])
   861  		q.Feedback = append(q.Feedback, Feedback{&lower, &upper, pb.Counts[i/2], 0})
   862  	}
   863  	if c != nil {
   864  		// decode the index point feedback, just set value count in CM Sketch
   865  		start := len(pb.IndexRanges) / 2
   866  		if len(pb.HashValues) > 0 {
   867  			// It needs raw values to uFIDelate the top n, so just skip it here.
   868  			if len(c.topN) > 0 {
   869  				return
   870  			}
   871  			for i := 0; i < len(pb.HashValues); i += 2 {
   872  				c.setValue(pb.HashValues[i], pb.HashValues[i+1], uint64(pb.Counts[start+i/2]))
   873  			}
   874  			return
   875  		}
   876  		for i := 0; i < len(pb.IndexPoints); i++ {
   877  			c.uFIDelateValueBytes(pb.IndexPoints[i], uint64(pb.Counts[start+i]))
   878  		}
   879  	}
   880  }
   881  
   882  func decodeFeedbackForPK(q *QueryFeedback, pb *queryFeedback, isUnsigned bool) {
   883  	q.Tp = PkType
   884  	// decode feedback for primary key
   885  	for i := 0; i < len(pb.IntRanges); i += 2 {
   886  		var lower, upper types.Causet
   887  		if isUnsigned {
   888  			lower.SetUint64(uint64(pb.IntRanges[i]))
   889  			upper.SetUint64(uint64(pb.IntRanges[i+1]))
   890  		} else {
   891  			lower.SetInt64(pb.IntRanges[i])
   892  			upper.SetInt64(pb.IntRanges[i+1])
   893  		}
   894  		q.Feedback = append(q.Feedback, Feedback{&lower, &upper, pb.Counts[i/2], 0})
   895  	}
   896  }
   897  
   898  // ConvertCausetsType converts the type of the causets to `ft`.
   899  func ConvertCausetsType(vals []types.Causet, ft *types.FieldType, loc *time.Location) error {
   900  	for i, val := range vals {
   901  		if val.HoTT() == types.HoTTMinNotNull || val.HoTT() == types.HoTTMaxValue {
   902  			continue
   903  		}
   904  		newVal, err := blockcodec.UnflattenCausets([]types.Causet{val}, []*types.FieldType{ft}, loc)
   905  		if err != nil {
   906  			return err
   907  		}
   908  		vals[i] = newVal[0]
   909  	}
   910  	return nil
   911  }
   912  
   913  func decodeDeferredCausetBounds(data []byte, ft *types.FieldType) ([]types.Causet, error) {
   914  	vals, _, err := codec.DecodeRange(data, 1, nil, nil)
   915  	if err != nil {
   916  		return nil, err
   917  	}
   918  	err = ConvertCausetsType(vals, ft, time.UTC)
   919  	return vals, err
   920  }
   921  
   922  func decodeFeedbackForDeferredCauset(q *QueryFeedback, pb *queryFeedback, ft *types.FieldType) error {
   923  	q.Tp = DefCausType
   924  	for i := 0; i < len(pb.DeferredCausetRanges); i += 2 {
   925  		low, err := decodeDeferredCausetBounds(pb.DeferredCausetRanges[i], ft)
   926  		if err != nil {
   927  			return err
   928  		}
   929  		high, err := decodeDeferredCausetBounds(pb.DeferredCausetRanges[i+1], ft)
   930  		if err != nil {
   931  			return err
   932  		}
   933  		q.Feedback = append(q.Feedback, Feedback{&low[0], &high[0], pb.Counts[i/2], 0})
   934  	}
   935  	return nil
   936  }
   937  
   938  // DecodeFeedback decodes a byte slice to feedback.
   939  func DecodeFeedback(val []byte, q *QueryFeedback, c *CMSketch, ft *types.FieldType) error {
   940  	buf := bytes.NewBuffer(val)
   941  	dec := gob.NewDecoder(buf)
   942  	pb := &queryFeedback{}
   943  	err := dec.Decode(pb)
   944  	if err != nil {
   945  		return errors.Trace(err)
   946  	}
   947  	if len(pb.IndexRanges) > 0 || len(pb.HashValues) > 0 || len(pb.IndexPoints) > 0 {
   948  		decodeFeedbackForIndex(q, pb, c)
   949  	} else if len(pb.IntRanges) > 0 {
   950  		decodeFeedbackForPK(q, pb, allegrosql.HasUnsignedFlag(ft.Flag))
   951  	} else {
   952  		err = decodeFeedbackForDeferredCauset(q, pb, ft)
   953  	}
   954  	return err
   955  }
   956  
   957  // SplitFeedbackByQueryType splits the feedbacks into equality feedbacks and range feedbacks.
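        // A feedback is treated as an equality feedback when the PrefixNext of its lower bound is not
        // smaller than its upper bound, i.e. the range [Lower, Upper) cannot contain more than the
        // single encoded value; everything else is kept as a range feedback.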
   958  func SplitFeedbackByQueryType(feedbacks []Feedback) ([]Feedback, []Feedback) {
   959  	var eqFB, ranFB []Feedback
   960  	for _, fb := range feedbacks {
   961  		// Use `>=` here because sometimes the lower bound is equal to the upper bound.
   962  		if bytes.Compare(ekv.Key(fb.Lower.GetBytes()).PrefixNext(), fb.Upper.GetBytes()) >= 0 {
   963  			eqFB = append(eqFB, fb)
   964  		} else {
   965  			ranFB = append(ranFB, fb)
   966  		}
   967  	}
   968  	return eqFB, ranFB
   969  }
   970  
   971  // setNextValue sets the next value for the given causet. For types like float,
   972  // we do not set it because the domain is not discrete and it does not matter much when estimating the scalar info.
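        // For example, an int64 causet 7 becomes 8, a bytes causet gets ekv.Key.PrefixNext applied
        // (e.g. 0x6162 becomes 0x6163), and duration/time values are advanced by one nanosecond.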
   973  func setNextValue(d *types.Causet) {
   974  	switch d.HoTT() {
   975  	case types.HoTTBytes, types.HoTTString:
   976  		// This is the encoded value instead of the string value, so SetBytes is enough.
   977  		d.SetBytes(ekv.Key(d.GetBytes()).PrefixNext())
   978  	case types.HoTTInt64:
   979  		d.SetInt64(d.GetInt64() + 1)
   980  	case types.HoTTUint64:
   981  		d.SetUint64(d.GetUint64() + 1)
   982  	case types.HoTTMysqlDuration:
   983  		duration := d.GetMysqlDuration()
   984  		duration.Duration = duration.Duration + 1
   985  		d.SetMysqlDuration(duration)
   986  	case types.HoTTMysqlTime:
   987  		t := d.GetMysqlTime()
   988  		sc := &stmtctx.StatementContext{TimeZone: types.BoundTimezone}
   989  		if _, err := t.Add(sc, types.Duration{Duration: 1, Fsp: 0}); err != nil {
   990  			log.Error(errors.ErrorStack(err))
   991  		}
   992  		d.SetMysqlTime(t)
   993  	}
   994  }
   995  
   996  // SupportDeferredCausetType checks if the type of the column can be uFIDelated by feedback.
   997  func SupportDeferredCausetType(ft *types.FieldType) bool {
   998  	switch ft.Tp {
   999  	case allegrosql.TypeTiny, allegrosql.TypeShort, allegrosql.TypeInt24, allegrosql.TypeLong, allegrosql.TypeLonglong, allegrosql.TypeFloat,
  1000  		allegrosql.TypeDouble, allegrosql.TypeString, allegrosql.TypeVarString, allegrosql.TypeVarchar, allegrosql.TypeBlob, allegrosql.TypeTinyBlob, allegrosql.TypeMediumBlob, allegrosql.TypeLongBlob,
  1001  		allegrosql.TypeNewDecimal, allegrosql.TypeDuration, allegrosql.TypeDate, allegrosql.TypeDatetime, allegrosql.TypeTimestamp:
  1002  		return true
  1003  	}
  1004  	return false
  1005  }