github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/histogram.go

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"bytes"
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"strings"
    22  	"time"
    23  	"unsafe"
    24  
    25  	"github.com/twmb/murmur3"
    26  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    27  	"github.com/whtcorpsinc/BerolinaSQL/perceptron"
    28  	"github.com/whtcorpsinc/BerolinaSQL/terror"
    29  	"github.com/whtcorpsinc/errors"
    30  	"github.com/whtcorpsinc/fidelpb/go-fidelpb"
    31  	"github.com/whtcorpsinc/milevadb/blockcodec"
    32  	"github.com/whtcorpsinc/milevadb/ekv"
    33  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    34  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    35  	"github.com/whtcorpsinc/milevadb/soliton/collate"
    36  	"github.com/whtcorpsinc/milevadb/soliton/logutil"
    37  	"github.com/whtcorpsinc/milevadb/soliton/ranger"
    38  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    39  	"github.com/whtcorpsinc/milevadb/stochastikctx/variable"
    40  	"github.com/whtcorpsinc/milevadb/types"
    41  	"go.uber.org/zap"
    42  )
    43  
    44  // Histogram represents statistics for a column or index.
    45  type Histogram struct {
    46  	ID        int64 // DeferredCauset ID.
    47  	NDV       int64 // Number of distinct values.
    48  	NullCount int64 // Number of null values.
    49  	// LastUFIDelateVersion is the version at which this histogram was last updated.
    50  	LastUFIDelateVersion uint64
    51  
    52  	Tp *types.FieldType
    53  
    54  	// Histogram elements.
    55  	//
    56  	// A bucket's bounds are the smallest and greatest values stored in the bucket. The lower and
    57  	// upper bounds are stored in one column.
    58  	//
    59  	// A bucket count is the number of items stored in all previous buckets and the current bucket.
    60  	// Bucket counts are always in increasing order.
    61  	//
    62  	// A bucket repeat is the number of repeats of the bucket's upper bound value; it can be used to find popular values.
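        	//
        	// For example, with two buckets the storage layout is:
        	//   Bounds rows:  [lower0, upper0, lower1, upper1]
        	//   Buckets:      [{Count: c0, Repeat: r0}, {Count: c0 + c1, Repeat: r1}]
        	// so bucket i's bounds live at Bounds rows 2*i and 2*i+1, and Buckets[i].Count is the cumulative
        	// row count up to and including bucket i.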
    63  	Bounds  *chunk.Chunk
    64  	Buckets []Bucket
    65  
    66  	// Used for estimating the fraction of the interval [lower, upper] that lies within [lower, value].
    67  	// For some types like `Int`, we do not build it because we can get them directly from `Bounds`.
    68  	scalars []scalar
    69  	// TotDefCausSize is the total column size for the histogram.
    70  	// For unfixed-len types, it includes LEN and BYTE.
    71  	TotDefCausSize int64
    72  
    73  	// Correlation is the statistical correlation between the physical row ordering and the logical ordering of
    74  	// the column values. It ranges from -1 to +1, and it is only valid for a DeferredCauset histogram, not for an
    75  	// Index histogram.
    76  	Correlation float64
    77  }
    78  
    79  // Bucket stores the bucket count and repeat.
    80  type Bucket struct {
    81  	Count  int64
    82  	Repeat int64
    83  }
    84  
    85  type scalar struct {
    86  	lower        float64
    87  	upper        float64
    88  	commonPfxLen int // commonPfxLen is the common prefix length of the lower bound and upper bound when the value type is HoTTString or HoTTBytes.
    89  }
    90  
    91  // NewHistogram creates a new histogram.
    92  func NewHistogram(id, ndv, nullCount int64, version uint64, tp *types.FieldType, bucketSize int, totDefCausSize int64) *Histogram {
    93  	return &Histogram{
    94  		ID:                   id,
    95  		NDV:                  ndv,
    96  		NullCount:            nullCount,
    97  		LastUFIDelateVersion: version,
    98  		Tp:                   tp,
    99  		Bounds:               chunk.NewChunkWithCapacity([]*types.FieldType{tp}, 2*bucketSize),
   100  		Buckets:              make([]Bucket, 0, bucketSize),
   101  		TotDefCausSize:       totDefCausSize,
   102  	}
   103  }
   104  
   105  // GetLower gets the lower bound of bucket `idx`.
   106  func (hg *Histogram) GetLower(idx int) *types.Causet {
   107  	d := hg.Bounds.GetRow(2*idx).GetCauset(0, hg.Tp)
   108  	return &d
   109  }
   110  
   111  // GetUpper gets the upper bound of bucket `idx`.
   112  func (hg *Histogram) GetUpper(idx int) *types.Causet {
   113  	d := hg.Bounds.GetRow(2*idx+1).GetCauset(0, hg.Tp)
   114  	return &d
   115  }
   116  
   117  // MemoryUsage returns the total memory usage of this Histogram.
   118  // Every time the Histogram of the table changes, recalculating the memory usage costs O(n),
   119  // so calling this may take a little time.
   120  // We ignore the size of other metadata in Histogram.
   121  func (hg *Histogram) MemoryUsage() (sum int64) {
   122  	if hg == nil {
   123  		return
   124  	}
   125  	sum = hg.Bounds.MemoryUsage() + int64(cap(hg.Buckets)*int(unsafe.Sizeof(Bucket{}))) + int64(cap(hg.scalars)*int(unsafe.Sizeof(scalar{})))
   126  	return
   127  }
   128  
   129  // AvgDefCausSize is the average column size of the histogram. These sizes are derived from the function `encode`
   130  // and `Causet::ConvertTo`, so we need to update them if those two functions are changed.
   131  func (c *DeferredCauset) AvgDefCausSize(count int64, isKey bool) float64 {
   132  	if count == 0 {
   133  		return 0
   134  	}
   135  	// Note that if the handle column is encoded as a value instead of a key, i.e.,
   136  	// when the handle column is in a unique index, the real column size may be
   137  	// smaller than 8 because it is encoded using `EncodeVarint`. Since we don't
   138  	// know the exact value size now, use 8 as an approximation.
   139  	if c.IsHandle {
   140  		return 8
   141  	}
   142  	histCount := c.TotalRowCount()
   143  	notNullRatio := 1.0
   144  	if histCount > 0 {
   145  		notNullRatio = 1.0 - float64(c.NullCount)/histCount
   146  	}
   147  	switch c.Histogram.Tp.Tp {
   148  	case allegrosql.TypeFloat, allegrosql.TypeDouble, allegrosql.TypeDuration, allegrosql.TypeDate, allegrosql.TypeDatetime, allegrosql.TypeTimestamp:
   149  		return 8 * notNullRatio
   150  	case allegrosql.TypeTiny, allegrosql.TypeShort, allegrosql.TypeInt24, allegrosql.TypeLong, allegrosql.TypeLonglong, allegrosql.TypeYear, allegrosql.TypeEnum, allegrosql.TypeBit, allegrosql.TypeSet:
   151  		if isKey {
   152  			return 8 * notNullRatio
   153  		}
   154  	}
   155  	// Keep two decimal places.
   156  	return math.Round(float64(c.TotDefCausSize)/float64(count)*100) / 100
   157  }
   158  
   159  // AvgDefCausSizeChunkFormat is the average column size of the histogram. These sizes are derived from the functions `Encode`
   160  // and `DecodeToChunk`, so we need to update them if those two functions are changed.
   161  func (c *DeferredCauset) AvgDefCausSizeChunkFormat(count int64) float64 {
   162  	if count == 0 {
   163  		return 0
   164  	}
   165  	fixedLen := chunk.GetFixedLen(c.Histogram.Tp)
   166  	if fixedLen != -1 {
   167  		return float64(fixedLen)
   168  	}
   169  	// Keep two decimal places.
   170  	// Add 8 bytes for the unfixed-len type's offsets.
   171  	// Subtract Log2(avgSize) for the unfixed-len type's LEN.
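        	// For example, with TotDefCausSize = 6400 bytes over count = 100 rows, avgSize is 64, so the
        	// result is round((64-log2(64))*100)/100 + 8 = 58 + 8 = 66 bytes per row.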
   172  	avgSize := float64(c.TotDefCausSize) / float64(count)
   173  	if avgSize < 1 {
   174  		return math.Round(avgSize*100)/100 + 8
   175  	}
   176  	return math.Round((avgSize-math.Log2(avgSize))*100)/100 + 8
   177  }
   178  
   179  // AvgDefCausSizeListInDisk is the average column size of the histogram. These sizes are derived
   180  // from `chunk.ListInDisk`, so we need to update them if that implementation changes.
   181  func (c *DeferredCauset) AvgDefCausSizeListInDisk(count int64) float64 {
   182  	if count == 0 {
   183  		return 0
   184  	}
   185  	histCount := c.TotalRowCount()
   186  	notNullRatio := 1.0
   187  	if histCount > 0 {
   188  		notNullRatio = 1.0 - float64(c.NullCount)/histCount
   189  	}
   190  	size := chunk.GetFixedLen(c.Histogram.Tp)
   191  	if size != -1 {
   192  		return float64(size) * notNullRatio
   193  	}
   194  	// Keep two decimal places.
   195  	// Subtract Log2(avgSize) for the unfixed-len type's LEN.
   196  	avgSize := float64(c.TotDefCausSize) / float64(count)
   197  	if avgSize < 1 {
   198  		return math.Round((avgSize)*100) / 100
   199  	}
   200  	return math.Round((avgSize-math.Log2(avgSize))*100) / 100
   201  }
   202  
   203  // AppendBucket appends a bucket into `hg`.
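        //
        // A minimal usage sketch (illustrative only, not taken from the original sources), using the bytes
        // constructor that appears elsewhere in this file; `count` is cumulative across all buckets
        // appended so far:
        //
        //	lo1, up1 := types.NewBytesCauset([]byte("a")), types.NewBytesCauset([]byte("f"))
        //	lo2, up2 := types.NewBytesCauset([]byte("g")), types.NewBytesCauset([]byte("z"))
        //	hg.AppendBucket(&lo1, &up1, 10, 2) // cumulative count 10; upper bound "f" repeats twice
        //	hg.AppendBucket(&lo2, &up2, 18, 3) // cumulative count 18; upper bound "z" repeats 3 times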
   204  func (hg *Histogram) AppendBucket(lower *types.Causet, upper *types.Causet, count, repeat int64) {
   205  	hg.Buckets = append(hg.Buckets, Bucket{Count: count, Repeat: repeat})
   206  	hg.Bounds.AppendCauset(0, lower)
   207  	hg.Bounds.AppendCauset(0, upper)
   208  }
   209  
   210  func (hg *Histogram) uFIDelateLastBucket(upper *types.Causet, count, repeat int64) {
   211  	len := hg.Len()
   212  	hg.Bounds.TruncateTo(2*len - 1)
   213  	hg.Bounds.AppendCauset(0, upper)
   214  	hg.Buckets[len-1] = Bucket{Count: count, Repeat: repeat}
   215  }
   216  
   217  // DecodeTo decodes the histogram bucket values into `Tp`.
   218  func (hg *Histogram) DecodeTo(tp *types.FieldType, timeZone *time.Location) error {
   219  	oldIter := chunk.NewIterator4Chunk(hg.Bounds)
   220  	hg.Bounds = chunk.NewChunkWithCapacity([]*types.FieldType{tp}, oldIter.Len())
   221  	hg.Tp = tp
   222  	for event := oldIter.Begin(); event != oldIter.End(); event = oldIter.Next() {
   223  		causet, err := blockcodec.DecodeDeferredCausetValue(event.GetBytes(0), tp, timeZone)
   224  		if err != nil {
   225  			return errors.Trace(err)
   226  		}
   227  		hg.Bounds.AppendCauset(0, &causet)
   228  	}
   229  	return nil
   230  }
   231  
   232  // ConvertTo converts the histogram bucket values into `Tp`.
   233  func (hg *Histogram) ConvertTo(sc *stmtctx.StatementContext, tp *types.FieldType) (*Histogram, error) {
   234  	hist := NewHistogram(hg.ID, hg.NDV, hg.NullCount, hg.LastUFIDelateVersion, tp, hg.Len(), hg.TotDefCausSize)
   235  	hist.Correlation = hg.Correlation
   236  	iter := chunk.NewIterator4Chunk(hg.Bounds)
   237  	for event := iter.Begin(); event != iter.End(); event = iter.Next() {
   238  		d := event.GetCauset(0, hg.Tp)
   239  		d, err := d.ConvertTo(sc, tp)
   240  		if err != nil {
   241  			return nil, errors.Trace(err)
   242  		}
   243  		hist.Bounds.AppendCauset(0, &d)
   244  	}
   245  	hist.Buckets = hg.Buckets
   246  	return hist, nil
   247  }
   248  
   249  // Len is the number of buckets in the histogram.
   250  func (hg *Histogram) Len() int {
   251  	return len(hg.Buckets)
   252  }
   253  
   254  // HistogramEqual tests if two histograms are equal.
   255  func HistogramEqual(a, b *Histogram, ignoreID bool) bool {
   256  	if ignoreID {
   257  		old := b.ID
   258  		b.ID = a.ID
   259  		defer func() { b.ID = old }()
   260  	}
   261  	return bytes.Equal([]byte(a.ToString(0)), []byte(b.ToString(0)))
   262  }
   263  
   264  // Constants for the stats version. These constants can be used for solving compatibility issues.
   265  const (
   266  	CurStatsVersion = Version1
   267  	Version1        = 1
   268  )
   269  
   270  // AnalyzeFlag is set when the statistics come from analyze and have not been modified by feedback.
   271  const AnalyzeFlag = 1
   272  
   273  // IsAnalyzed checks whether this flag contains AnalyzeFlag.
   274  func IsAnalyzed(flag int64) bool {
   275  	return (flag & AnalyzeFlag) > 0
   276  }
   277  
   278  // ResetAnalyzeFlag resets the AnalyzeFlag because it has been modified by feedback.
   279  func ResetAnalyzeFlag(flag int64) int64 {
   280  	return flag &^ AnalyzeFlag
   281  }
   282  
   283  // ValueToString converts a possibly encoded value to a formatted string. If the value is encoded, then
   284  // idxDefCauss equals the number of origin values; otherwise idxDefCauss is 0.
   285  func ValueToString(vars *variable.StochastikVars, value *types.Causet, idxDefCauss int, idxDeferredCausetTypes []byte) (string, error) {
   286  	if idxDefCauss == 0 {
   287  		return value.ToString()
   288  	}
   289  	var loc *time.Location
   290  	if vars != nil {
   291  		loc = vars.Location()
   292  	}
   293  	// Ignore the error and treat the remaining part that cannot be decoded successfully as bytes.
   294  	decodedVals, remained, err := codec.DecodeRange(value.GetBytes(), idxDefCauss, idxDeferredCausetTypes, loc)
   295  	// Ignore err explicitly to pass errcheck.
   296  	_ = err
   297  	if len(remained) > 0 {
   298  		decodedVals = append(decodedVals, types.NewBytesCauset(remained))
   299  	}
   300  	str, err := types.CausetsToString(decodedVals, true)
   301  	return str, err
   302  }
   303  
   304  // BucketToString changes the given bucket to string format.
   305  func (hg *Histogram) BucketToString(bktID, idxDefCauss int) string {
   306  	upperVal, err := ValueToString(nil, hg.GetUpper(bktID), idxDefCauss, nil)
   307  	terror.Log(errors.Trace(err))
   308  	lowerVal, err := ValueToString(nil, hg.GetLower(bktID), idxDefCauss, nil)
   309  	terror.Log(errors.Trace(err))
   310  	return fmt.Sprintf("num: %d lower_bound: %s upper_bound: %s repeats: %d", hg.bucketCount(bktID), lowerVal, upperVal, hg.Buckets[bktID].Repeat)
   311  }
   312  
   313  // ToString gets the string representation for the histogram.
   314  func (hg *Histogram) ToString(idxDefCauss int) string {
   315  	strs := make([]string, 0, hg.Len()+1)
   316  	if idxDefCauss > 0 {
   317  		strs = append(strs, fmt.Sprintf("index:%d ndv:%d", hg.ID, hg.NDV))
   318  	} else {
   319  		strs = append(strs, fmt.Sprintf("column:%d ndv:%d totDefCausSize:%d", hg.ID, hg.NDV, hg.TotDefCausSize))
   320  	}
   321  	for i := 0; i < hg.Len(); i++ {
   322  		strs = append(strs, hg.BucketToString(i, idxDefCauss))
   323  	}
   324  	return strings.Join(strs, "\n")
   325  }
   326  
   327  // equalRowCount estimates the row count where the column equals value.
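        // For example, with bounds rows [1, 5, 6, 9] (two buckets [1, 5] and [6, 9]), looking up 5 returns
        // LowerBound index 1 (odd, i.e. an upper bound) with match == true, so the estimate is
        // Buckets[0].Repeat; looking up 3 also returns index 1 but with match == false, so the estimate
        // falls back to notNullCount()/NDV.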
   328  func (hg *Histogram) equalRowCount(value types.Causet) float64 {
   329  	index, match := hg.Bounds.LowerBound(0, &value)
   330  	// Since we store the lower and upper bound together, if the index is an odd number, it points to an upper bound.
   331  	if index%2 == 1 {
   332  		if match {
   333  			return float64(hg.Buckets[index/2].Repeat)
   334  		}
   335  		return hg.notNullCount() / float64(hg.NDV)
   336  	}
   337  	if match {
   338  		cmp := chunk.GetCompareFunc(hg.Tp)
   339  		if cmp(hg.Bounds.GetRow(index), 0, hg.Bounds.GetRow(index+1), 0) == 0 {
   340  			return float64(hg.Buckets[index/2].Repeat)
   341  		}
   342  		return hg.notNullCount() / float64(hg.NDV)
   343  	}
   344  	return 0
   345  }
   346  
   347  // greaterRowCount estimates the row count where the column is greater than value.
   348  func (hg *Histogram) greaterRowCount(value types.Causet) float64 {
   349  	gtCount := hg.notNullCount() - hg.lessRowCount(value) - hg.equalRowCount(value)
   350  	return math.Max(0, gtCount)
   351  }
   352  
   353  // LessRowCountWithBktIdx estimates the row count where the column is less than value.
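        // For example, with a single bucket [0, 10] whose Count is 100 and Repeat is 20, a lookup of 5
        // that lands inside the bucket is estimated as
        // preCount + calcFraction(bucketIdx, &value)*(curCount-curRepeat-preCount), i.e. roughly
        // 0 + 0.5*(100-20-0) = 40 rows below 5 (the exact fraction depends on the type's scalar representation).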
   354  func (hg *Histogram) LessRowCountWithBktIdx(value types.Causet) (float64, int) {
   355  	// All the values are null.
   356  	if hg.Bounds.NumRows() == 0 {
   357  		return 0, 0
   358  	}
   359  	index, match := hg.Bounds.LowerBound(0, &value)
   360  	if index == hg.Bounds.NumRows() {
   361  		return hg.notNullCount(), hg.Len() - 1
   362  	}
   363  	// Since we store the lower and upper bound together, dividing the index by 2 gives the bucket index.
   364  	bucketIdx := index / 2
   365  	curCount, curRepeat := float64(hg.Buckets[bucketIdx].Count), float64(hg.Buckets[bucketIdx].Repeat)
   366  	preCount := float64(0)
   367  	if bucketIdx > 0 {
   368  		preCount = float64(hg.Buckets[bucketIdx-1].Count)
   369  	}
   370  	if index%2 == 1 {
   371  		if match {
   372  			return curCount - curRepeat, bucketIdx
   373  		}
   374  		return preCount + hg.calcFraction(bucketIdx, &value)*(curCount-curRepeat-preCount), bucketIdx
   375  	}
   376  	return preCount, bucketIdx
   377  }
   378  
   379  func (hg *Histogram) lessRowCount(value types.Causet) float64 {
   380  	result, _ := hg.LessRowCountWithBktIdx(value)
   381  	return result
   382  }
   383  
   384  // BetweenRowCount estimates the row count where the column is greater than or equal to a and less than b.
   385  func (hg *Histogram) BetweenRowCount(a, b types.Causet) float64 {
   386  	lessCountA := hg.lessRowCount(a)
   387  	lessCountB := hg.lessRowCount(b)
   388  	// If lessCountA is not less than lessCountB, it may be that they fall into the same bucket and we cannot estimate
   389  	// the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not be greater than
   390  	// lessCountB or notNullCount-lessCountA.
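        	// For example, if both a and b land in the same bucket so that lessCountA == lessCountB == 30,
        	// with notNullCount() == 100 and NDV == 20, the estimate is min(min(30, 100-30), 100/20) = 5.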
   391  	if lessCountA >= lessCountB && hg.NDV > 0 {
   392  		result := math.Min(lessCountB, hg.notNullCount()-lessCountA)
   393  		return math.Min(result, hg.notNullCount()/float64(hg.NDV))
   394  	}
   395  	return lessCountB - lessCountA
   396  }
   397  
   398  // TotalRowCount returns the total count of this histogram.
   399  func (hg *Histogram) TotalRowCount() float64 {
   400  	return hg.notNullCount() + float64(hg.NullCount)
   401  }
   402  
   403  // notNullCount indicates the count of non-null values in a column histogram or a single-column index histogram.
   404  // For a multi-column index histogram, since we cannot define null for a row, we treat all rows as non-null; that means
   405  // notNullCount returns the same value as TotalRowCount for multi-column index histograms.
   406  func (hg *Histogram) notNullCount() float64 {
   407  	if hg.Len() == 0 {
   408  		return 0
   409  	}
   410  	return float64(hg.Buckets[hg.Len()-1].Count)
   411  }
   412  
   413  // mergeBuckets merges every two neighboring buckets among the first bucketIdx+1 buckets.
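        // For example, mergeBuckets(3) on buckets b0..b3 merges (b0, b1) and (b2, b3) into two buckets whose
        // bounds are [lower(b0), upper(b1)] and [lower(b2), upper(b3)]; when bucketIdx is even, the trailing
        // bucket at bucketIdx is kept as-is.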
   414  func (hg *Histogram) mergeBuckets(bucketIdx int) {
   415  	curBuck := 0
   416  	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp}, bucketIdx)
   417  	for i := 0; i+1 <= bucketIdx; i += 2 {
   418  		hg.Buckets[curBuck] = hg.Buckets[i+1]
   419  		c.AppendCauset(0, hg.GetLower(i))
   420  		c.AppendCauset(0, hg.GetUpper(i+1))
   421  		curBuck++
   422  	}
   423  	if bucketIdx%2 == 0 {
   424  		hg.Buckets[curBuck] = hg.Buckets[bucketIdx]
   425  		c.AppendCauset(0, hg.GetLower(bucketIdx))
   426  		c.AppendCauset(0, hg.GetUpper(bucketIdx))
   427  		curBuck++
   428  	}
   429  	hg.Bounds = c
   430  	hg.Buckets = hg.Buckets[:curBuck]
   431  }
   432  
   433  // GetIncreaseFactor returns the factor by which the data has grown since the last analysis.
   434  func (hg *Histogram) GetIncreaseFactor(totalCount int64) float64 {
   435  	columnCount := hg.TotalRowCount()
   436  	if columnCount == 0 {
   437  		// avoid dividing by 0
   438  		return 1.0
   439  	}
   440  	return float64(totalCount) / columnCount
   441  }
   442  
   443  // validRange checks if the range is valid. It is used by `SplitRange` to remove invalid ranges;
   444  // the possible range types are index key range and handle key range.
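        // For example, the range ("a", "a"] with LowExclude set becomes [PrefixNext("a"), PrefixNext("a"))
        // after the adjustments below; the two keys compare equal, so the range is reported as invalid and dropped.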
   445  func validRange(sc *stmtctx.StatementContext, ran *ranger.Range, encoded bool) bool {
   446  	var low, high []byte
   447  	if encoded {
   448  		low, high = ran.LowVal[0].GetBytes(), ran.HighVal[0].GetBytes()
   449  	} else {
   450  		var err error
   451  		low, err = codec.EncodeKey(sc, nil, ran.LowVal[0])
   452  		if err != nil {
   453  			return false
   454  		}
   455  		high, err = codec.EncodeKey(sc, nil, ran.HighVal[0])
   456  		if err != nil {
   457  			return false
   458  		}
   459  	}
   460  	if ran.LowExclude {
   461  		low = ekv.Key(low).PrefixNext()
   462  	}
   463  	if !ran.HighExclude {
   464  		high = ekv.Key(high).PrefixNext()
   465  	}
   466  	return bytes.Compare(low, high) < 0
   467  }
   468  
   469  func checkHoTT(vals []types.Causet, HoTT byte) bool {
   470  	if HoTT == types.HoTTString {
   471  		HoTT = types.HoTTBytes
   472  	}
   473  	for _, val := range vals {
   474  		valHoTT := val.HoTT()
   475  		if valHoTT == types.HoTTNull || valHoTT == types.HoTTMinNotNull || valHoTT == types.HoTTMaxValue {
   476  			continue
   477  		}
   478  		if valHoTT == types.HoTTString {
   479  			valHoTT = types.HoTTBytes
   480  		}
   481  		if valHoTT != HoTT {
   482  			return false
   483  		}
   484  		// Only check the first non-null value.
   485  		break
   486  	}
   487  	return true
   488  }
   489  
   490  func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool {
   491  	HoTT := hg.GetLower(0).HoTT()
   492  	for _, ran := range ranges {
   493  		if !checkHoTT(ran.LowVal, HoTT) || !checkHoTT(ran.HighVal, HoTT) {
   494  			return false
   495  		}
   496  	}
   497  	return true
   498  }
   499  
   500  // SplitRange splits the ranges according to the histogram lower bounds. Note that we treat the first bucket's lower bound
   501  // as -inf and the last bucket's upper bound as +inf, so every split range falls entirely within one of (-inf, l(1)),
   502  // [l(1), l(2)), ..., [l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets and l(i) is the i-th bucket's lower bound.
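        //
        // For example, with three buckets whose lower bounds are 1, 10 and 20, the range [5, 25] is split
        // into [5, 10), [10, 20) and [20, 25], each of which falls entirely within one of the intervals above.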
   503  func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) {
   504  	if !hg.typeMatch(oldRanges) {
   505  		return oldRanges, false
   506  	}
   507  	// Treat the only bucket as (-inf, +inf), so we do not need to split it.
   508  	if hg.Len() == 1 {
   509  		return oldRanges, true
   510  	}
   511  	ranges := make([]*ranger.Range, 0, len(oldRanges))
   512  	for _, ran := range oldRanges {
   513  		ranges = append(ranges, ran.Clone())
   514  	}
   515  	split := make([]*ranger.Range, 0, len(ranges))
   516  	for len(ranges) > 0 {
   517  		// Find the first bound that is greater than the LowVal.
   518  		idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0])
   519  		// Treat the last bucket's upper bound as +inf, so we do not need to split any more.
   520  		if idx >= hg.Bounds.NumRows()-1 {
   521  			split = append(split, ranges...)
   522  			break
   523  		}
   524  		// Treat the first bucket's lower bound as -inf; just advance to the next lower bound.
   525  		if idx == 0 {
   526  			idx = 2
   527  		}
   528  		// Get the next lower bound.
   529  		if idx%2 == 1 {
   530  			idx++
   531  		}
   532  		lowerBound := hg.Bounds.GetRow(idx)
   533  		var i int
   534  		// Find the first range that needs to be split by the lower bound.
   535  		for ; i < len(ranges); i++ {
   536  			if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 {
   537  				break
   538  			}
   539  		}
   540  		split = append(split, ranges[:i]...)
   541  		ranges = ranges[i:]
   542  		if len(ranges) == 0 {
   543  			break
   544  		}
   545  		// Split according to the lower bound.
   546  		cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0])
   547  		if cmp > 0 {
   548  			lower := lowerBound.GetCauset(0, hg.Tp)
   549  			newRange := &ranger.Range{
   550  				LowExclude:  ranges[0].LowExclude,
   551  				LowVal:      []types.Causet{ranges[0].LowVal[0]},
   552  				HighVal:     []types.Causet{lower},
   553  				HighExclude: true}
   554  			if validRange(sc, newRange, encoded) {
   555  				split = append(split, newRange)
   556  			}
   557  			ranges[0].LowVal[0] = lower
   558  			ranges[0].LowExclude = false
   559  			if !validRange(sc, ranges[0], encoded) {
   560  				ranges = ranges[1:]
   561  			}
   562  		}
   563  	}
   564  	return split, true
   565  }
   566  
   567  func (hg *Histogram) bucketCount(idx int) int64 {
   568  	if idx == 0 {
   569  		return hg.Buckets[0].Count
   570  	}
   571  	return hg.Buckets[idx].Count - hg.Buckets[idx-1].Count
   572  }
   573  
   574  // HistogramToProto converts Histogram to its protobuf representation.
   575  // Note that when this is used, the lower/upper bound in the bucket must be BytesCauset.
   576  func HistogramToProto(hg *Histogram) *fidelpb.Histogram {
   577  	protoHg := &fidelpb.Histogram{
   578  		Ndv: hg.NDV,
   579  	}
   580  	for i := 0; i < hg.Len(); i++ {
   581  		bkt := &fidelpb.Bucket{
   582  			Count:      hg.Buckets[i].Count,
   583  			LowerBound: hg.GetLower(i).GetBytes(),
   584  			UpperBound: hg.GetUpper(i).GetBytes(),
   585  			Repeats:    hg.Buckets[i].Repeat,
   586  		}
   587  		protoHg.Buckets = append(protoHg.Buckets, bkt)
   588  	}
   589  	return protoHg
   590  }
   591  
   592  // HistogramFromProto converts Histogram from its protobuf representation.
   593  // Note that we set BytesCauset for the lower/upper bound in the bucket; the decoding happens
   594  // after all histograms are merged.
   595  func HistogramFromProto(protoHg *fidelpb.Histogram) *Histogram {
   596  	tp := types.NewFieldType(allegrosql.TypeBlob)
   597  	hg := NewHistogram(0, protoHg.Ndv, 0, 0, tp, len(protoHg.Buckets), 0)
   598  	for _, bucket := range protoHg.Buckets {
   599  		lower, upper := types.NewBytesCauset(bucket.LowerBound), types.NewBytesCauset(bucket.UpperBound)
   600  		hg.AppendBucket(&lower, &upper, bucket.Count, bucket.Repeats)
   601  	}
   602  	return hg
   603  }
   604  
   605  func (hg *Histogram) popFirstBucket() {
   606  	hg.Buckets = hg.Buckets[1:]
   607  	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp, hg.Tp}, hg.Bounds.NumRows()-2)
   608  	c.Append(hg.Bounds, 2, hg.Bounds.NumRows())
   609  	hg.Bounds = c
   610  }
   611  
   612  // IsIndexHist checks whether the current histogram is an index histogram.
   613  func (hg *Histogram) IsIndexHist() bool {
   614  	return hg.Tp.Tp == allegrosql.TypeBlob
   615  }
   616  
   617  // MergeHistograms merges two histograms.
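        // If lh's last upper bound equals rh's first lower bound, the two boundary buckets are merged first
        // (and NDV is decremented to avoid double counting); both histograms are then repeatedly halved with
        // mergeBuckets until they fit within bucketSize and their average bucket sizes are comparable, after
        // which rh's buckets are appended to lh with their cumulative counts shifted by lh's total count
        // (minus the rows already merged into the boundary bucket).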
   618  func MergeHistograms(sc *stmtctx.StatementContext, lh *Histogram, rh *Histogram, bucketSize int) (*Histogram, error) {
   619  	if lh.Len() == 0 {
   620  		return rh, nil
   621  	}
   622  	if rh.Len() == 0 {
   623  		return lh, nil
   624  	}
   625  	lh.NDV += rh.NDV
   626  	lLen := lh.Len()
   627  	cmp, err := lh.GetUpper(lLen-1).CompareCauset(sc, rh.GetLower(0))
   628  	if err != nil {
   629  		return nil, errors.Trace(err)
   630  	}
   631  	offset := int64(0)
   632  	if cmp == 0 {
   633  		lh.NDV--
   634  		lh.uFIDelateLastBucket(rh.GetUpper(0), lh.Buckets[lLen-1].Count+rh.Buckets[0].Count, rh.Buckets[0].Repeat)
   635  		offset = rh.Buckets[0].Count
   636  		rh.popFirstBucket()
   637  	}
   638  	for lh.Len() > bucketSize {
   639  		lh.mergeBuckets(lh.Len() - 1)
   640  	}
   641  	if rh.Len() == 0 {
   642  		return lh, nil
   643  	}
   644  	for rh.Len() > bucketSize {
   645  		rh.mergeBuckets(rh.Len() - 1)
   646  	}
   647  	lCount := lh.Buckets[lh.Len()-1].Count
   648  	rCount := rh.Buckets[rh.Len()-1].Count - offset
   649  	lAvg := float64(lCount) / float64(lh.Len())
   650  	rAvg := float64(rCount) / float64(rh.Len())
   651  	for lh.Len() > 1 && lAvg*2 <= rAvg {
   652  		lh.mergeBuckets(lh.Len() - 1)
   653  		lAvg *= 2
   654  	}
   655  	for rh.Len() > 1 && rAvg*2 <= lAvg {
   656  		rh.mergeBuckets(rh.Len() - 1)
   657  		rAvg *= 2
   658  	}
   659  	for i := 0; i < rh.Len(); i++ {
   660  		lh.AppendBucket(rh.GetLower(i), rh.GetUpper(i), rh.Buckets[i].Count+lCount-offset, rh.Buckets[i].Repeat)
   661  	}
   662  	for lh.Len() > bucketSize {
   663  		lh.mergeBuckets(lh.Len() - 1)
   664  	}
   665  	return lh, nil
   666  }
   667  
   668  // AvgCountPerNotNullValue gets the average row count per value from the histogram data.
   669  func (hg *Histogram) AvgCountPerNotNullValue(totalCount int64) float64 {
   670  	factor := hg.GetIncreaseFactor(totalCount)
   671  	totalNotNull := hg.notNullCount() * factor
   672  	curNDV := float64(hg.NDV) * factor
   673  	curNDV = math.Max(curNDV, 1)
   674  	return totalNotNull / curNDV
   675  }
   676  
   677  func (hg *Histogram) outOfRange(val types.Causet) bool {
   678  	if hg.Len() == 0 {
   679  		return true
   680  	}
   681  	return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 ||
   682  		chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0
   683  }
   684  
   685  // Copy deep copies the histogram.
   686  func (hg *Histogram) Copy() *Histogram {
   687  	newHist := *hg
   688  	newHist.Bounds = hg.Bounds.CopyConstruct()
   689  	newHist.Buckets = make([]Bucket, 0, len(hg.Buckets))
   690  	newHist.Buckets = append(newHist.Buckets, hg.Buckets...)
   691  	return &newHist
   692  }
   693  
   694  // RemoveUpperBound removes the rows counted for the upper bound (its repeats) from the last bucket of the histogram.
   695  // It is used when merging stats for incremental analyze.
   696  func (hg *Histogram) RemoveUpperBound() *Histogram {
   697  	hg.Buckets[hg.Len()-1].Count -= hg.Buckets[hg.Len()-1].Repeat
   698  	hg.Buckets[hg.Len()-1].Repeat = 0
   699  	return hg
   700  }
   701  
   702  // TruncateHistogram truncates the histogram to `numBkt` buckets.
   703  func (hg *Histogram) TruncateHistogram(numBkt int) *Histogram {
   704  	hist := hg.Copy()
   705  	hist.Buckets = hist.Buckets[:numBkt]
   706  	hist.Bounds.TruncateTo(numBkt * 2)
   707  	return hist
   708  }
   709  
   710  // ErrorRate is the error rate of estimating row counts by bucket and CM sketch.
   711  type ErrorRate struct {
   712  	ErrorTotal float64
   713  	QueryTotal int64
   714  }
   715  
   716  // MaxErrorRate is the max error rate of estimated row counts for a non-pseudo column.
   717  // If the table is pseudo, but the average error rate is less than MaxErrorRate,
   718  // then the column is not pseudo.
   719  const MaxErrorRate = 0.25
   720  
   721  // NotAccurate is true when the total query count is zero or the average error
   722  // rate is greater than MaxErrorRate.
   723  func (e *ErrorRate) NotAccurate() bool {
   724  	if e.QueryTotal == 0 {
   725  		return true
   726  	}
   727  	return e.ErrorTotal/float64(e.QueryTotal) > MaxErrorRate
   728  }
   729  
   730  // UFIDelate updates the ErrorRate with a newly observed error rate.
   731  func (e *ErrorRate) UFIDelate(rate float64) {
   732  	e.QueryTotal++
   733  	e.ErrorTotal += rate
   734  }
   735  
   736  // Merge merges two ErrorRate.
   737  func (e *ErrorRate) Merge(rate *ErrorRate) {
   738  	e.QueryTotal += rate.QueryTotal
   739  	e.ErrorTotal += rate.ErrorTotal
   740  }
   741  
   742  // DeferredCauset represents a column histogram.
   743  type DeferredCauset struct {
   744  	Histogram
   745  	*CMSketch
   746  	PhysicalID int64
   747  	Count      int64
   748  	Info       *perceptron.DeferredCausetInfo
   749  	IsHandle   bool
   750  	ErrorRate
   751  	Flag           int64
   752  	LastAnalyzePos types.Causet
   753  }
   754  
   755  func (c *DeferredCauset) String() string {
   756  	return c.Histogram.ToString(0)
   757  }
   758  
   759  // MemoryUsage returns the total memory usage of the Histogram and CMSketch in DeferredCauset.
   760  // We ignore the size of other metadata in DeferredCauset.
   761  func (c *DeferredCauset) MemoryUsage() (sum int64) {
   762  	sum = c.Histogram.MemoryUsage()
   763  	if c.CMSketch != nil {
   764  		sum += c.CMSketch.MemoryUsage()
   765  	}
   766  	return
   767  }
   768  
   769  // HistogramNeededDeferredCausets stores the columns whose Histograms need to be loaded from the physical ekv layer.
   770  // Currently, we only load the index/pk's Histogram from ekv automatically; columns' histograms are loaded on demand.
   771  var HistogramNeededDeferredCausets = neededDeferredCausetMap{defcaus: map[blockDeferredCausetID]struct{}{}}
   772  
   773  // IsInvalid checks if this column is invalid. If this column has a histogram that has not been loaded yet, we mark it
   774  // as needing its histogram.
   775  func (c *DeferredCauset) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool {
   776  	if collPseudo && c.NotAccurate() {
   777  		return true
   778  	}
   779  	if c.NDV > 0 && c.Len() == 0 && sc != nil {
   780  		sc.SetHistogramsNotLoad()
   781  		HistogramNeededDeferredCausets.insert(blockDeferredCausetID{TableID: c.PhysicalID, DeferredCausetID: c.Info.ID})
   782  	}
   783  	return c.TotalRowCount() == 0 || (c.NDV > 0 && c.Len() == 0)
   784  }
   785  
   786  func (c *DeferredCauset) equalRowCount(sc *stmtctx.StatementContext, val types.Causet, modifyCount int64) (float64, error) {
   787  	if val.IsNull() {
   788  		return float64(c.NullCount), nil
   789  	}
   790  	// All the values are null.
   791  	if c.Histogram.Bounds.NumRows() == 0 {
   792  		return 0.0, nil
   793  	}
   794  	if c.NDV > 0 && c.outOfRange(val) {
   795  		return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
   796  	}
   797  	if c.CMSketch != nil {
   798  		count, err := c.CMSketch.queryValue(sc, val)
   799  		return float64(count), errors.Trace(err)
   800  	}
   801  	return c.Histogram.equalRowCount(val), nil
   802  }
   803  
   804  // GetDeferredCausetRowCount estimates the row count for a slice of Range.
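        // Three cases are handled per range: a point range (low == high) is estimated with equalRowCount, a
        // small enumerable range is summed value by value via enumRangeValues, and any other range is
        // estimated with BetweenRowCount plus boundary and out-of-range adjustments.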
   805  func (c *DeferredCauset) GetDeferredCausetRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64, pkIsHandle bool) (float64, error) {
   806  	var rowCount float64
   807  	for _, rg := range ranges {
   808  		highVal := *rg.HighVal[0].Clone()
   809  		lowVal := *rg.LowVal[0].Clone()
   810  		if highVal.HoTT() == types.HoTTString {
   811  			highVal.SetBytesAsString(collate.GetDefCauslator(
   812  				highVal.DefCauslation()).Key(highVal.GetString()),
   813  				highVal.DefCauslation(),
   814  				uint32(highVal.Length()),
   815  			)
   816  		}
   817  		if lowVal.HoTT() == types.HoTTString {
   818  			lowVal.SetBytesAsString(collate.GetDefCauslator(
   819  				lowVal.DefCauslation()).Key(lowVal.GetString()),
   820  				lowVal.DefCauslation(),
   821  				uint32(lowVal.Length()),
   822  			)
   823  		}
   824  		cmp, err := lowVal.CompareCauset(sc, &highVal)
   825  		if err != nil {
   826  			return 0, errors.Trace(err)
   827  		}
   828  		if cmp == 0 {
   829  			// the point case.
   830  			if !rg.LowExclude && !rg.HighExclude {
   831  				// In this case, the row count is at most 1.
   832  				if pkIsHandle {
   833  					rowCount += 1
   834  					continue
   835  				}
   836  				var cnt float64
   837  				cnt, err = c.equalRowCount(sc, lowVal, modifyCount)
   838  				if err != nil {
   839  					return 0, errors.Trace(err)
   840  				}
   841  				rowCount += cnt
   842  			}
   843  			continue
   844  		}
   845  		rangeVals := enumRangeValues(lowVal, highVal, rg.LowExclude, rg.HighExclude)
   846  		// The small range case.
   847  		if rangeVals != nil {
   848  			for _, val := range rangeVals {
   849  				cnt, err := c.equalRowCount(sc, val, modifyCount)
   850  				if err != nil {
   851  					return 0, err
   852  				}
   853  				rowCount += cnt
   854  			}
   855  			continue
   856  		}
   857  		// The interval case.
   858  		cnt := c.BetweenRowCount(lowVal, highVal)
   859  		if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
   860  			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
   861  		}
   862  		// `BetweenRowCount` returns the count for the [l, h) range; we adjust cnt for the boundaries here.
   863  		// Note that `cnt` does not include null values, so we need to specially handle cases
   864  		// where null is the lower bound.
   865  		if rg.LowExclude && !lowVal.IsNull() {
   866  			lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount)
   867  			if err != nil {
   868  				return 0, errors.Trace(err)
   869  			}
   870  			cnt -= lowCnt
   871  		}
   872  		if !rg.LowExclude && lowVal.IsNull() {
   873  			cnt += float64(c.NullCount)
   874  		}
   875  		if !rg.HighExclude {
   876  			highCnt, err := c.equalRowCount(sc, highVal, modifyCount)
   877  			if err != nil {
   878  				return 0, errors.Trace(err)
   879  			}
   880  			cnt += highCnt
   881  		}
   882  		rowCount += cnt
   883  	}
   884  	if rowCount > c.TotalRowCount() {
   885  		rowCount = c.TotalRowCount()
   886  	} else if rowCount < 0 {
   887  		rowCount = 0
   888  	}
   889  	return rowCount, nil
   890  }
   891  
   892  // Index represents an index histogram.
   893  type Index struct {
   894  	Histogram
   895  	*CMSketch
   896  	ErrorRate
   897  	StatsVer       int64 // StatsVer is the version of the current stats, used to maintain compatibility
   898  	Info           *perceptron.IndexInfo
   899  	Flag           int64
   900  	LastAnalyzePos types.Causet
   901  }
   902  
   903  func (idx *Index) String() string {
   904  	return idx.Histogram.ToString(len(idx.Info.DeferredCausets))
   905  }
   906  
   907  // IsInvalid checks if this index is invalid.
   908  func (idx *Index) IsInvalid(collPseudo bool) bool {
   909  	return (collPseudo && idx.NotAccurate()) || idx.TotalRowCount() == 0
   910  }
   911  
   912  // MemoryUsage returns the total memory usage of the Histogram and CMSketch in Index.
   913  // We ignore the size of other metadata in Index.
   914  func (idx *Index) MemoryUsage() (sum int64) {
   915  	sum = idx.Histogram.MemoryUsage()
   916  	if idx.CMSketch != nil {
   917  		sum += idx.CMSketch.MemoryUsage()
   918  	}
   919  	return
   920  }
   921  
   922  var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewCauset(nil))
   923  
   924  func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) (float64, error) {
   925  	if len(idx.Info.DeferredCausets) == 1 {
   926  		if bytes.Equal(b, nullKeyBytes) {
   927  			return float64(idx.NullCount), nil
   928  		}
   929  	}
   930  	val := types.NewBytesCauset(b)
   931  	if idx.NDV > 0 && idx.outOfRange(val) {
   932  		return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
   933  	}
   934  	if idx.CMSketch != nil {
   935  		return float64(idx.CMSketch.QueryBytes(b)), nil
   936  	}
   937  	return idx.Histogram.equalRowCount(val), nil
   938  }
   939  
   940  // GetRowCount returns the row count of the given ranges.
   941  // It uses modifyCount to adjust for the influence of modifications on the table.
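        // Each range's low/high values are encoded into index keys; exclusive bounds are advanced with
        // PrefixNext so that BetweenRowCount always sees a half-open [l, r) key interval, and a NULL lower
        // bound on a single-column index additionally contributes NullCount rows.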
   942  func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
   943  	totalCount := float64(0)
   944  	isSingleDefCaus := len(idx.Info.DeferredCausets) == 1
   945  	for _, indexRange := range indexRanges {
   946  		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
   947  		if err != nil {
   948  			return 0, err
   949  		}
   950  		rb, err := codec.EncodeKey(sc, nil, indexRange.HighVal...)
   951  		if err != nil {
   952  			return 0, err
   953  		}
   954  		fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.DeferredCausets)
   955  		if bytes.Equal(lb, rb) {
   956  			if indexRange.LowExclude || indexRange.HighExclude {
   957  				continue
   958  			}
   959  			if fullLen {
   960  				// At most 1 in this case.
   961  				if idx.Info.Unique {
   962  					totalCount += 1
   963  					continue
   964  				}
   965  				count, err := idx.equalRowCount(sc, lb, modifyCount)
   966  				if err != nil {
   967  					return 0, err
   968  				}
   969  				totalCount += count
   970  				continue
   971  			}
   972  		}
   973  		if indexRange.LowExclude {
   974  			lb = ekv.Key(lb).PrefixNext()
   975  		}
   976  		if !indexRange.HighExclude {
   977  			rb = ekv.Key(rb).PrefixNext()
   978  		}
   979  		l := types.NewBytesCauset(lb)
   980  		r := types.NewBytesCauset(rb)
   981  		totalCount += idx.BetweenRowCount(l, r)
   982  		lowIsNull := bytes.Equal(lb, nullKeyBytes)
   983  		if (idx.outOfRange(l) && !(isSingleDefCaus && lowIsNull)) || idx.outOfRange(r) {
   984  			totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
   985  		}
   986  		if isSingleDefCaus && lowIsNull {
   987  			totalCount += float64(idx.NullCount)
   988  		}
   989  	}
   990  	if totalCount > idx.TotalRowCount() {
   991  		totalCount = idx.TotalRowCount()
   992  	}
   993  	return totalCount, nil
   994  }
   995  
   996  type countByRangeFunc = func(*stmtctx.StatementContext, int64, []*ranger.Range) (float64, error)
   997  
   998  // newHistogramBySelectivity fills the content of the new histogram from the given selectivity result.
   999  // TODO: Causet is not efficient, try to avoid using it here.
  1000  //  Also, there are redundant calculations with Selectivity(). We need to reduce them too.
  1001  func newHistogramBySelectivity(sc *stmtctx.StatementContext, histID int64, oldHist, newHist *Histogram, ranges []*ranger.Range, cntByRangeFunc countByRangeFunc) error {
  1002  	cntPerVal := int64(oldHist.AvgCountPerNotNullValue(int64(oldHist.TotalRowCount())))
  1003  	var totCnt int64
  1004  	for boundIdx, ranIdx, highRangeIdx := 0, 0, 0; boundIdx < oldHist.Bounds.NumRows() && ranIdx < len(ranges); boundIdx, ranIdx = boundIdx+2, highRangeIdx {
  1005  		for highRangeIdx < len(ranges) && chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx].HighVal[0]) >= 0 {
  1006  			highRangeIdx++
  1007  		}
  1008  		if boundIdx+2 >= oldHist.Bounds.NumRows() && highRangeIdx < len(ranges) && ranges[highRangeIdx].HighVal[0].HoTT() == types.HoTTMaxValue {
  1009  			highRangeIdx++
  1010  		}
  1011  		if ranIdx == highRangeIdx {
  1012  			continue
  1013  		}
  1014  		cnt, err := cntByRangeFunc(sc, histID, ranges[ranIdx:highRangeIdx])
  1015  		// This should not happen.
  1016  		if err != nil {
  1017  			return err
  1018  		}
  1019  		if cnt == 0 {
  1020  			continue
  1021  		}
  1022  		if int64(cnt) > oldHist.bucketCount(boundIdx/2) {
  1023  			cnt = float64(oldHist.bucketCount(boundIdx / 2))
  1024  		}
  1025  		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx))
  1026  		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx + 1))
  1027  		totCnt += int64(cnt)
  1028  		bkt := Bucket{Count: totCnt}
  1029  		if chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx-1].HighVal[0]) == 0 && !ranges[highRangeIdx-1].HighExclude {
  1030  			bkt.Repeat = cntPerVal
  1031  		}
  1032  		newHist.Buckets = append(newHist.Buckets, bkt)
  1033  		switch newHist.Tp.EvalType() {
  1034  		case types.ETString, types.ETDecimal, types.ETDatetime, types.ETTimestamp:
  1035  			newHist.scalars = append(newHist.scalars, oldHist.scalars[boundIdx/2])
  1036  		}
  1037  	}
  1038  	return nil
  1039  }
  1040  
  1041  func (idx *Index) newIndexBySelectivity(sc *stmtctx.StatementContext, statsNode *StatsNode) (*Index, error) {
  1042  	var (
  1043  		ranLowEncode, ranHighEncode []byte
  1044  		err                         error
  1045  	)
  1046  	newIndexHist := &Index{Info: idx.Info, StatsVer: idx.StatsVer, CMSketch: idx.CMSketch}
  1047  	newIndexHist.Histogram = *NewHistogram(idx.ID, int64(float64(idx.NDV)*statsNode.Selectivity), 0, 0, types.NewFieldType(allegrosql.TypeBlob), chunk.InitialCapacity, 0)
  1048  
  1049  	lowBucketIdx, highBucketIdx := 0, 0
  1050  	var totCnt int64
  1051  
  1052  	// The bucket bounds of an index are encoded, so we need to decode them if we want to calculate the fraction accurately.
  1053  	// TODO: enhance this calculation.
  1054  	// For now, just remove the buckets that no range falls in.
  1055  	for _, ran := range statsNode.Ranges {
  1056  		lowBucketIdx = highBucketIdx
  1057  		ranLowEncode, ranHighEncode, err = ran.Encode(sc, ranLowEncode, ranHighEncode)
  1058  		if err != nil {
  1059  			return nil, err
  1060  		}
  1061  		for ; highBucketIdx < idx.Len(); highBucketIdx++ {
  1062  			// An encoded value can only advance to its next value quickly, so ranHighEncode is actually range.HighVal's PrefixNext value.
  1063  			// The bound should therefore also go to its PrefixNext.
  1064  			bucketLowerEncoded := idx.Bounds.GetRow(highBucketIdx * 2).GetBytes(0)
  1065  			if bytes.Compare(ranHighEncode, ekv.Key(bucketLowerEncoded).PrefixNext()) < 0 {
  1066  				break
  1067  			}
  1068  		}
  1069  		for ; lowBucketIdx < highBucketIdx; lowBucketIdx++ {
  1070  			bucketUpperEncoded := idx.Bounds.GetRow(lowBucketIdx*2 + 1).GetBytes(0)
  1071  			if bytes.Compare(ranLowEncode, bucketUpperEncoded) <= 0 {
  1072  				break
  1073  			}
  1074  		}
  1075  		if lowBucketIdx >= idx.Len() {
  1076  			break
  1077  		}
  1078  		for i := lowBucketIdx; i < highBucketIdx; i++ {
  1079  			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i * 2))
  1080  			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i*2 + 1))
  1081  			totCnt += idx.bucketCount(i)
  1082  			newIndexHist.Buckets = append(newIndexHist.Buckets, Bucket{Repeat: idx.Buckets[i].Repeat, Count: totCnt})
  1083  			newIndexHist.scalars = append(newIndexHist.scalars, idx.scalars[i])
  1084  		}
  1085  	}
  1086  	return newIndexHist, nil
  1087  }
  1088  
  1089  // NewHistDefCauslBySelectivity creates a new HistDefCausl from the given statsNodes.
  1090  func (coll *HistDefCausl) NewHistDefCauslBySelectivity(sc *stmtctx.StatementContext, statsNodes []*StatsNode) *HistDefCausl {
  1091  	newDefCausl := &HistDefCausl{
  1092  		DeferredCausets:       make(map[int64]*DeferredCauset),
  1093  		Indices:               make(map[int64]*Index),
  1094  		Idx2DeferredCausetIDs: coll.Idx2DeferredCausetIDs,
  1095  		DefCausID2IdxID:       coll.DefCausID2IdxID,
  1096  		Count:                 coll.Count,
  1097  	}
  1098  	for _, node := range statsNodes {
  1099  		if node.Tp == IndexType {
  1100  			idxHist, ok := coll.Indices[node.ID]
  1101  			if !ok {
  1102  				continue
  1103  			}
  1104  			newIdxHist, err := idxHist.newIndexBySelectivity(sc, node)
  1105  			if err != nil {
  1106  				logutil.BgLogger().Warn("[Histogram-in-plan]: something went wrong when calculating row count, "+
  1107  					"failed to build histogram for index",
  1108  					zap.String("index", idxHist.Info.Name.O), zap.String("causet", idxHist.Info.Block.O), zap.Error(err))
  1109  				continue
  1110  			}
  1111  			newDefCausl.Indices[node.ID] = newIdxHist
  1112  			continue
  1113  		}
  1114  		oldDefCaus, ok := coll.DeferredCausets[node.ID]
  1115  		if !ok {
  1116  			continue
  1117  		}
  1118  		newDefCaus := &DeferredCauset{
  1119  			PhysicalID: oldDefCaus.PhysicalID,
  1120  			Info:       oldDefCaus.Info,
  1121  			IsHandle:   oldDefCaus.IsHandle,
  1122  			CMSketch:   oldDefCaus.CMSketch,
  1123  		}
  1124  		newDefCaus.Histogram = *NewHistogram(oldDefCaus.ID, int64(float64(oldDefCaus.NDV)*node.Selectivity), 0, 0, oldDefCaus.Tp, chunk.InitialCapacity, 0)
  1125  		var err error
  1126  		splitRanges, ok := oldDefCaus.Histogram.SplitRange(sc, node.Ranges, false)
  1127  		if !ok {
  1128  			logutil.BgLogger().Warn("[Histogram-in-plan]: the type of histogram and ranges mismatch")
  1129  			continue
  1130  		}
  1131  		// Deal with some corner cases.
  1132  		if len(splitRanges) > 0 {
  1133  			// Deal with NULL values.
  1134  			if splitRanges[0].LowVal[0].IsNull() {
  1135  				newDefCaus.NullCount = oldDefCaus.NullCount
  1136  				if splitRanges[0].HighVal[0].IsNull() {
  1137  					splitRanges = splitRanges[1:]
  1138  				} else {
  1139  					splitRanges[0].LowVal[0].SetMinNotNull()
  1140  				}
  1141  			}
  1142  		}
  1143  		if oldDefCaus.IsHandle {
  1144  			err = newHistogramBySelectivity(sc, node.ID, &oldDefCaus.Histogram, &newDefCaus.Histogram, splitRanges, coll.GetRowCountByIntDeferredCausetRanges)
  1145  		} else {
  1146  			err = newHistogramBySelectivity(sc, node.ID, &oldDefCaus.Histogram, &newDefCaus.Histogram, splitRanges, coll.GetRowCountByDeferredCausetRanges)
  1147  		}
  1148  		if err != nil {
  1149  			logutil.BgLogger().Warn("[Histogram-in-plan]: something went wrong when calculating row count",
  1150  				zap.Error(err))
  1151  			continue
  1152  		}
  1153  		newDefCausl.DeferredCausets[node.ID] = newDefCaus
  1154  	}
  1155  	for id, idx := range coll.Indices {
  1156  		_, ok := newDefCausl.Indices[id]
  1157  		if !ok {
  1158  			newDefCausl.Indices[id] = idx
  1159  		}
  1160  	}
  1161  	for id, col := range coll.DeferredCausets {
  1162  		_, ok := newDefCausl.DeferredCausets[id]
  1163  		if !ok {
  1164  			newDefCausl.DeferredCausets[id] = col
  1165  		}
  1166  	}
  1167  	return newDefCausl
  1168  }
  1169  
  1170  func (idx *Index) outOfRange(val types.Causet) bool {
  1171  	if idx.Histogram.Len() == 0 {
  1172  		return true
  1173  	}
  1174  	withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
  1175  		matchPrefix(idx.Bounds.GetRow(0), 0, &val)
  1176  	withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
  1177  	return !withInLowBoundOrPrefixMatch || !withInHighBound
  1178  }
  1179  
  1180  // matchPrefix checks whether ad is a prefix of the value in column colIdx of the given row.
  1181  func matchPrefix(event chunk.Row, colIdx int, ad *types.Causet) bool {
  1182  	switch ad.HoTT() {
  1183  	case types.HoTTString, types.HoTTBytes, types.HoTTBinaryLiteral, types.HoTTMysqlBit:
  1184  		return strings.HasPrefix(event.GetString(colIdx), ad.GetString())
  1185  	}
  1186  	return false
  1187  }
  1188  
  1189  type dataCnt struct {
  1190  	data []byte
  1191  	cnt  uint64
  1192  }
  1193  
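        // getIndexPrefixLens cuts an encoded index key column by column and returns the cumulative byte
        // length after each column. For example, if the key encodes three columns of 3, 4 and 2 bytes, it
        // returns [3, 7, 9], so data[:prefixLens[i]] is the encoded prefix covering the first i+1 columns.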
  1194  func getIndexPrefixLens(data []byte, numDefCauss int) (prefixLens []int, err error) {
  1195  	prefixLens = make([]int, 0, numDefCauss)
  1196  	var colData []byte
  1197  	prefixLen := 0
  1198  	for len(data) > 0 {
  1199  		colData, data, err = codec.CutOne(data)
  1200  		if err != nil {
  1201  			return nil, err
  1202  		}
  1203  		prefixLen += len(colData)
  1204  		prefixLens = append(prefixLens, prefixLen)
  1205  	}
  1206  	return prefixLens, nil
  1207  }
  1208  
  1209  // ExtractTopN extracts the top-N values from the histogram into the CM sketch's TopN structure.
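        // Candidate values are taken only from bucket boundaries (the histogram is equal-depth, so any
        // sufficiently frequent value must appear as a boundary), their frequencies are estimated with
        // BetweenRowCount over the value's prefix range, and the selected values are moved out of the CM
        // sketch counters into cms.topN.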
  1210  func (hg *Histogram) ExtractTopN(cms *CMSketch, numDefCauss int, numTopN uint32) error {
  1211  	if hg.Len() == 0 || cms == nil || numTopN == 0 {
  1212  		return nil
  1213  	}
  1214  	dataSet := make(map[string]struct{}, hg.Bounds.NumRows())
  1215  	dataCnts := make([]dataCnt, 0, hg.Bounds.NumRows())
  1216  	hg.PreCalculateScalar()
  1217  	// Set a limit on the frequency of boundary values to avoid extracting values with low frequency.
  1218  	limit := hg.notNullCount() / float64(hg.Len())
  1219  	// Since our histograms are equal-depth, frequent values must occur on the boundaries of buckets.
  1220  	for i := 0; i < hg.Bounds.NumRows(); i++ {
  1221  		data := hg.Bounds.GetRow(i).GetBytes(0)
  1222  		prefixLens, err := getIndexPrefixLens(data, numDefCauss)
  1223  		if err != nil {
  1224  			return err
  1225  		}
  1226  		for _, prefixLen := range prefixLens {
  1227  			prefixDefCausData := data[:prefixLen]
  1228  			_, ok := dataSet[string(prefixDefCausData)]
  1229  			if ok {
  1230  				continue
  1231  			}
  1232  			dataSet[string(prefixDefCausData)] = struct{}{}
  1233  			res := hg.BetweenRowCount(types.NewBytesCauset(prefixDefCausData), types.NewBytesCauset(ekv.Key(prefixDefCausData).PrefixNext()))
  1234  			if res >= limit {
  1235  				dataCnts = append(dataCnts, dataCnt{prefixDefCausData, uint64(res)})
  1236  			}
  1237  		}
  1238  	}
  1239  	sort.SliceStable(dataCnts, func(i, j int) bool { return dataCnts[i].cnt > dataCnts[j].cnt })
  1240  	if len(dataCnts) > int(numTopN) {
  1241  		dataCnts = dataCnts[:numTopN]
  1242  	}
  1243  	cms.topN = make(map[uint64][]*TopNMeta, len(dataCnts))
  1244  	for _, dataCnt := range dataCnts {
  1245  		h1, h2 := murmur3.Sum128(dataCnt.data)
  1246  		realCnt := cms.queryHashValue(h1, h2)
  1247  		cms.subValue(h1, h2, realCnt)
  1248  		cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, dataCnt.data, realCnt})
  1249  	}
  1250  	return nil
  1251  }