github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/table.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"fmt"
    18  	"math"
    19  	"sort"
    20  	"strings"
    21  	"sync"
    22  
    23  	"github.com/cznic/mathutil"
    24  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    25  	"github.com/whtcorpsinc/BerolinaSQL/perceptron"
    26  	"github.com/whtcorpsinc/errors"
    27  	"github.com/whtcorpsinc/milevadb/blockcodec"
    28  	"github.com/whtcorpsinc/milevadb/ekv"
    29  	"github.com/whtcorpsinc/milevadb/memex"
    30  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    31  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    32  	"github.com/whtcorpsinc/milevadb/soliton/ranger"
    33  	"github.com/whtcorpsinc/milevadb/stochastikctx"
    34  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    35  	"github.com/whtcorpsinc/milevadb/types"
    36  	"go.uber.org/atomic"
    37  )
    38  
// Divisors and defaults used by the pseudo (statistics-free) estimators in
// this file:
//   - pseudoEqualRate: an equality condition is estimated as Count/1000.
//   - pseudoLessRate: a one-sided range condition is estimated as Count/3.
//   - pseudoBetweenRate: a two-sided range condition is estimated as Count/40.
//   - pseudoDefCausSize: per-column size assumed by GetAvgRowSize when no
//     usable column statistics exist.
//   - outOfRangeBetweenRate: lower bound applied to the NDV in
//     outOfRangeEQSelectivity.
const (
	pseudoEqualRate   = 1000
	pseudoLessRate    = 3
	pseudoBetweenRate = 40
	pseudoDefCausSize = 8.0

	outOfRangeBetweenRate = 100
)
    47  
const (
	// PseudoVersion is the statistics version used for pseudo statistics; it is 0.
	PseudoVersion uint64 = 0

	// PseudoRowCount is exported so that other packages can use it.
	// When a causet has not been analyzed, pseudo statistics are used to estimate costs.
	// They assume a row count of 10000; an equal condition selects 1/1000 of the rows,
	// a less-than condition selects 1/3 of the rows, and a between condition selects
	// 1/40 of the rows.
	PseudoRowCount = 10000
)
    58  
// Block represents statistics for a causet.
// It embeds HistDefCausl (the column and index statistics) and adds a
// Version, a Name and an optional ExtendedStats collection. ExtendedStats may
// be nil; Copy only duplicates it when it is non-nil.
type Block struct {
	HistDefCausl
	Version       uint64
	Name          string
	ExtendedStats *ExtendedStatsDefCausl
}
    66  
// ExtendedStatsKey is the key for a cached item of an allegrosql.stats_extended record.
// It is used as the map key of ExtendedStatsDefCausl.Stats.
// NOTE(review): EDB presumably holds the database name — confirm against the loader.
type ExtendedStatsKey struct {
	StatsName string
	EDB       string
}
    72  
// ExtendedStatsItem is the cached item of an allegrosql.stats_extended record.
// NOTE(review): Tp presumably selects the kind of extended statistic and decides
// whether ScalarVals or StringVals carries the payload — confirm against the code
// that fills these items; nothing in this file reads the fields.
type ExtendedStatsItem struct {
	DefCausIDs []int64
	Tp         uint8
	ScalarVals float64
	StringVals string
}
    80  
// ExtendedStatsDefCausl is a collection of cached items for allegrosql.stats_extended records.
// Stats maps each ExtendedStatsKey to its cached item; LastUFIDelateVersion is
// carried over unchanged when a Block is copied.
type ExtendedStatsDefCausl struct {
	Stats                map[ExtendedStatsKey]*ExtendedStatsItem
	LastUFIDelateVersion uint64
}
    86  
    87  // NewExtendedStatsDefCausl allocate an ExtendedStatsDefCausl struct.
    88  func NewExtendedStatsDefCausl() *ExtendedStatsDefCausl {
    89  	return &ExtendedStatsDefCausl{Stats: make(map[ExtendedStatsKey]*ExtendedStatsItem)}
    90  }
    91  
// HistDefCausl is a collection of histograms. It gathers the information the
// planner needs to calculate selectivity: per-column statistics
// (DeferredCausets), per-index statistics (Indices), the row count (Count) and
// the number of modifications (ModifyCount).
type HistDefCausl struct {
	PhysicalID      int64
	DeferredCausets map[int64]*DeferredCauset
	Indices         map[int64]*Index
	// Idx2DeferredCausetIDs maps an index ID to the IDs of its columns. It is used to calculate the selectivity in causet.
	Idx2DeferredCausetIDs map[int64][]int64
	// DefCausID2IdxID maps a column ID to the ID of an index whose first column is that column. It is used to calculate the selectivity in causet.
	DefCausID2IdxID map[int64]int64
	Count           int64
	ModifyCount     int64 // Total modify count in a causet.

	// HavePhysicalID being true means this HistDefCausl comes from a single causet and carries that causet's ID.
	// The physical ID is used when trying to load column stats from storage.
	HavePhysicalID bool
	Pseudo         bool
}
   109  
   110  // MemoryUsage returns the total memory usage of this Block.
   111  // it will only calc the size of DeferredCausets and Indices stats data of causet.
   112  // We ignore the size of other spacetimedata in Block
   113  func (t *Block) MemoryUsage() (sum int64) {
   114  	for _, col := range t.DeferredCausets {
   115  		if col != nil {
   116  			sum += col.MemoryUsage()
   117  		}
   118  	}
   119  	for _, index := range t.Indices {
   120  		if index != nil {
   121  			sum += index.MemoryUsage()
   122  		}
   123  	}
   124  	return
   125  }
   126  
   127  // Copy copies the current causet.
   128  func (t *Block) Copy() *Block {
   129  	newHistDefCausl := HistDefCausl{
   130  		PhysicalID:      t.PhysicalID,
   131  		HavePhysicalID:  t.HavePhysicalID,
   132  		Count:           t.Count,
   133  		DeferredCausets: make(map[int64]*DeferredCauset, len(t.DeferredCausets)),
   134  		Indices:         make(map[int64]*Index, len(t.Indices)),
   135  		Pseudo:          t.Pseudo,
   136  		ModifyCount:     t.ModifyCount,
   137  	}
   138  	for id, col := range t.DeferredCausets {
   139  		newHistDefCausl.DeferredCausets[id] = col
   140  	}
   141  	for id, idx := range t.Indices {
   142  		newHistDefCausl.Indices[id] = idx
   143  	}
   144  	nt := &Block{
   145  		HistDefCausl: newHistDefCausl,
   146  		Version:      t.Version,
   147  		Name:         t.Name,
   148  	}
   149  	if t.ExtendedStats != nil {
   150  		newExtStatsDefCausl := &ExtendedStatsDefCausl{
   151  			Stats:                make(map[ExtendedStatsKey]*ExtendedStatsItem),
   152  			LastUFIDelateVersion: t.ExtendedStats.LastUFIDelateVersion,
   153  		}
   154  		for key, item := range t.ExtendedStats.Stats {
   155  			newExtStatsDefCausl.Stats[key] = item
   156  		}
   157  		nt.ExtendedStats = newExtStatsDefCausl
   158  	}
   159  	return nt
   160  }
   161  
   162  // String implements Stringer interface.
   163  func (t *Block) String() string {
   164  	strs := make([]string, 0, len(t.DeferredCausets)+1)
   165  	strs = append(strs, fmt.Sprintf("Block:%d Count:%d", t.PhysicalID, t.Count))
   166  	defcaus := make([]*DeferredCauset, 0, len(t.DeferredCausets))
   167  	for _, col := range t.DeferredCausets {
   168  		defcaus = append(defcaus, col)
   169  	}
   170  	sort.Slice(defcaus, func(i, j int) bool { return defcaus[i].ID < defcaus[j].ID })
   171  	for _, col := range defcaus {
   172  		strs = append(strs, col.String())
   173  	}
   174  	idxs := make([]*Index, 0, len(t.Indices))
   175  	for _, idx := range t.Indices {
   176  		idxs = append(idxs, idx)
   177  	}
   178  	sort.Slice(idxs, func(i, j int) bool { return idxs[i].ID < idxs[j].ID })
   179  	for _, idx := range idxs {
   180  		strs = append(strs, idx.String())
   181  	}
   182  	// TODO: concat content of ExtendedStatsDefCausl
   183  	return strings.Join(strs, "\n")
   184  }
   185  
   186  // IndexStartWithDeferredCauset finds the first index whose first column is the given column.
   187  func (t *Block) IndexStartWithDeferredCauset(colName string) *Index {
   188  	for _, index := range t.Indices {
   189  		if index.Info.DeferredCausets[0].Name.L == colName {
   190  			return index
   191  		}
   192  	}
   193  	return nil
   194  }
   195  
   196  // DeferredCausetByName finds the statistics.DeferredCauset for the given column.
   197  func (t *Block) DeferredCausetByName(colName string) *DeferredCauset {
   198  	for _, c := range t.DeferredCausets {
   199  		if c.Info.Name.L == colName {
   200  			return c
   201  		}
   202  	}
   203  	return nil
   204  }
   205  
// blockDeferredCausetID identifies a column by a table ID paired with a
// column ID. It is the key type of neededDeferredCausetMap.
type blockDeferredCausetID struct {
	TableID          int64
	DeferredCausetID int64
}
   210  
// neededDeferredCausetMap is a set of blockDeferredCausetID values whose
// accessors take the mutex m around every read or write of defcaus.
// The defcaus map must be allocated before insert is called; none of the
// methods in this file allocate it.
type neededDeferredCausetMap struct {
	m       sync.Mutex
	defcaus map[blockDeferredCausetID]struct{}
}
   215  
   216  func (n *neededDeferredCausetMap) AllDefCauss() []blockDeferredCausetID {
   217  	n.m.Lock()
   218  	keys := make([]blockDeferredCausetID, 0, len(n.defcaus))
   219  	for key := range n.defcaus {
   220  		keys = append(keys, key)
   221  	}
   222  	n.m.Unlock()
   223  	return keys
   224  }
   225  
   226  func (n *neededDeferredCausetMap) insert(col blockDeferredCausetID) {
   227  	n.m.Lock()
   228  	n.defcaus[col] = struct{}{}
   229  	n.m.Unlock()
   230  }
   231  
   232  func (n *neededDeferredCausetMap) Delete(col blockDeferredCausetID) {
   233  	n.m.Lock()
   234  	delete(n.defcaus, col)
   235  	n.m.Unlock()
   236  }
   237  
// RatioOfPseudoEstimate is the staleness threshold: when modifyCount / statsTblCount
// is greater than this ratio, the stats are considered invalid and pseudo
// estimation is used instead. It is an atomic value that starts at 0.7 and is
// read by Block.IsOutdated.
var RatioOfPseudoEstimate = atomic.NewFloat64(0.7)
   241  
   242  // IsOutdated returns true if the causet stats is outdated.
   243  func (t *Block) IsOutdated() bool {
   244  	if t.Count > 0 && float64(t.ModifyCount)/float64(t.Count) > RatioOfPseudoEstimate.Load() {
   245  		return true
   246  	}
   247  	return false
   248  }
   249  
   250  // DeferredCausetGreaterRowCount estimates the event count where the column greater than value.
   251  func (t *Block) DeferredCausetGreaterRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) float64 {
   252  	c, ok := t.DeferredCausets[colID]
   253  	if !ok || c.IsInvalid(sc, t.Pseudo) {
   254  		return float64(t.Count) / pseudoLessRate
   255  	}
   256  	return c.greaterRowCount(value) * c.GetIncreaseFactor(t.Count)
   257  }
   258  
   259  // DeferredCausetLessRowCount estimates the event count where the column less than value. Note that null values are not counted.
   260  func (t *Block) DeferredCausetLessRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) float64 {
   261  	c, ok := t.DeferredCausets[colID]
   262  	if !ok || c.IsInvalid(sc, t.Pseudo) {
   263  		return float64(t.Count) / pseudoLessRate
   264  	}
   265  	return c.lessRowCount(value) * c.GetIncreaseFactor(t.Count)
   266  }
   267  
   268  // DeferredCausetBetweenRowCount estimates the event count where column greater or equal to a and less than b.
   269  func (t *Block) DeferredCausetBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Causet, colID int64) float64 {
   270  	c, ok := t.DeferredCausets[colID]
   271  	if !ok || c.IsInvalid(sc, t.Pseudo) {
   272  		return float64(t.Count) / pseudoBetweenRate
   273  	}
   274  	count := c.BetweenRowCount(a, b)
   275  	if a.IsNull() {
   276  		count += float64(c.NullCount)
   277  	}
   278  	return count * c.GetIncreaseFactor(t.Count)
   279  }
   280  
   281  // DeferredCausetEqualRowCount estimates the event count where the column equals to value.
   282  func (t *Block) DeferredCausetEqualRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) (float64, error) {
   283  	c, ok := t.DeferredCausets[colID]
   284  	if !ok || c.IsInvalid(sc, t.Pseudo) {
   285  		return float64(t.Count) / pseudoEqualRate, nil
   286  	}
   287  	result, err := c.equalRowCount(sc, value, t.ModifyCount)
   288  	result *= c.GetIncreaseFactor(t.Count)
   289  	return result, errors.Trace(err)
   290  }
   291  
   292  // GetRowCountByIntDeferredCausetRanges estimates the event count by a slice of IntDeferredCausetRange.
   293  func (coll *HistDefCausl) GetRowCountByIntDeferredCausetRanges(sc *stmtctx.StatementContext, colID int64, intRanges []*ranger.Range) (float64, error) {
   294  	c, ok := coll.DeferredCausets[colID]
   295  	if !ok || c.IsInvalid(sc, coll.Pseudo) {
   296  		if len(intRanges) == 0 {
   297  			return 0, nil
   298  		}
   299  		if intRanges[0].LowVal[0].HoTT() == types.HoTTInt64 {
   300  			return getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count)), nil
   301  		}
   302  		return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil
   303  	}
   304  	result, err := c.GetDeferredCausetRowCount(sc, intRanges, coll.ModifyCount, true)
   305  	result *= c.GetIncreaseFactor(coll.Count)
   306  	return result, errors.Trace(err)
   307  }
   308  
   309  // GetRowCountByDeferredCausetRanges estimates the event count by a slice of Range.
   310  func (coll *HistDefCausl) GetRowCountByDeferredCausetRanges(sc *stmtctx.StatementContext, colID int64, colRanges []*ranger.Range) (float64, error) {
   311  	c, ok := coll.DeferredCausets[colID]
   312  	if !ok || c.IsInvalid(sc, coll.Pseudo) {
   313  		return GetPseudoRowCountByDeferredCausetRanges(sc, float64(coll.Count), colRanges, 0)
   314  	}
   315  	result, err := c.GetDeferredCausetRowCount(sc, colRanges, coll.ModifyCount, false)
   316  	result *= c.GetIncreaseFactor(coll.Count)
   317  	return result, errors.Trace(err)
   318  }
   319  
   320  // GetRowCountByIndexRanges estimates the event count by a slice of Range.
   321  func (coll *HistDefCausl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
   322  	idx := coll.Indices[idxID]
   323  	if idx == nil || idx.IsInvalid(coll.Pseudo) {
   324  		defcausLen := -1
   325  		if idx != nil && idx.Info.Unique {
   326  			defcausLen = len(idx.Info.DeferredCausets)
   327  		}
   328  		return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), defcausLen)
   329  	}
   330  	var result float64
   331  	var err error
   332  	if idx.CMSketch != nil && idx.StatsVer == Version1 {
   333  		result, err = coll.getIndexRowCount(sc, idxID, indexRanges)
   334  	} else {
   335  		result, err = idx.GetRowCount(sc, indexRanges, coll.ModifyCount)
   336  	}
   337  	result *= idx.GetIncreaseFactor(coll.Count)
   338  	return result, errors.Trace(err)
   339  }
   340  
   341  // PseudoAvgCountPerValue gets a pseudo average count if histogram not exists.
   342  func (t *Block) PseudoAvgCountPerValue() float64 {
   343  	return float64(t.Count) / pseudoEqualRate
   344  }
   345  
   346  // GetOrdinalOfRangeCond gets the ordinal of the position range condition,
   347  // if not exist, it returns the end position.
   348  func GetOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int {
   349  	for i := range ran.LowVal {
   350  		a, b := ran.LowVal[i], ran.HighVal[i]
   351  		cmp, err := a.CompareCauset(sc, &b)
   352  		if err != nil {
   353  			return 0
   354  		}
   355  		if cmp != 0 {
   356  			return i
   357  		}
   358  	}
   359  	return len(ran.LowVal)
   360  }
   361  
   362  // ID2UniqueID generates a new HistDefCausl whose `DeferredCausets` is built from UniqueID of given columns.
   363  func (coll *HistDefCausl) ID2UniqueID(columns []*memex.DeferredCauset) *HistDefCausl {
   364  	defcaus := make(map[int64]*DeferredCauset)
   365  	for _, col := range columns {
   366  		colHist, ok := coll.DeferredCausets[col.ID]
   367  		if ok {
   368  			defcaus[col.UniqueID] = colHist
   369  		}
   370  	}
   371  	newDefCausl := &HistDefCausl{
   372  		PhysicalID:      coll.PhysicalID,
   373  		HavePhysicalID:  coll.HavePhysicalID,
   374  		Pseudo:          coll.Pseudo,
   375  		Count:           coll.Count,
   376  		ModifyCount:     coll.ModifyCount,
   377  		DeferredCausets: defcaus,
   378  	}
   379  	return newDefCausl
   380  }
   381  
   382  // GenerateHistDefCauslFromDeferredCausetInfo generates a new HistDefCausl whose DefCausID2IdxID and IdxID2DefCausIDs is built from the given parameter.
   383  func (coll *HistDefCausl) GenerateHistDefCauslFromDeferredCausetInfo(infos []*perceptron.DeferredCausetInfo, columns []*memex.DeferredCauset) *HistDefCausl {
   384  	newDefCausHistMap := make(map[int64]*DeferredCauset)
   385  	colInfoID2UniqueID := make(map[int64]int64, len(columns))
   386  	colNames2UniqueID := make(map[string]int64)
   387  	for _, col := range columns {
   388  		colInfoID2UniqueID[col.ID] = col.UniqueID
   389  	}
   390  	for _, colInfo := range infos {
   391  		uniqueID, ok := colInfoID2UniqueID[colInfo.ID]
   392  		if ok {
   393  			colNames2UniqueID[colInfo.Name.L] = uniqueID
   394  		}
   395  	}
   396  	for id, colHist := range coll.DeferredCausets {
   397  		uniqueID, ok := colInfoID2UniqueID[id]
   398  		// DefCauslect the statistics by the given columns.
   399  		if ok {
   400  			newDefCausHistMap[uniqueID] = colHist
   401  		}
   402  	}
   403  	newIdxHistMap := make(map[int64]*Index)
   404  	idx2DeferredCausets := make(map[int64][]int64)
   405  	colID2IdxID := make(map[int64]int64)
   406  	for _, idxHist := range coll.Indices {
   407  		ids := make([]int64, 0, len(idxHist.Info.DeferredCausets))
   408  		for _, idxDefCaus := range idxHist.Info.DeferredCausets {
   409  			uniqueID, ok := colNames2UniqueID[idxDefCaus.Name.L]
   410  			if !ok {
   411  				break
   412  			}
   413  			ids = append(ids, uniqueID)
   414  		}
   415  		// If the length of the id list is 0, this index won't be used in this query.
   416  		if len(ids) == 0 {
   417  			continue
   418  		}
   419  		colID2IdxID[ids[0]] = idxHist.ID
   420  		newIdxHistMap[idxHist.ID] = idxHist
   421  		idx2DeferredCausets[idxHist.ID] = ids
   422  	}
   423  	newDefCausl := &HistDefCausl{
   424  		PhysicalID:            coll.PhysicalID,
   425  		HavePhysicalID:        coll.HavePhysicalID,
   426  		Pseudo:                coll.Pseudo,
   427  		Count:                 coll.Count,
   428  		ModifyCount:           coll.ModifyCount,
   429  		DeferredCausets:       newDefCausHistMap,
   430  		Indices:               newIdxHistMap,
   431  		DefCausID2IdxID:       colID2IdxID,
   432  		Idx2DeferredCausetIDs: idx2DeferredCausets,
   433  	}
   434  	return newDefCausl
   435  }
   436  
   437  // isSingleDefCausIdxNullRange checks if a range is [NULL, NULL] on a single-column index.
   438  func isSingleDefCausIdxNullRange(idx *Index, ran *ranger.Range) bool {
   439  	if len(idx.Info.DeferredCausets) > 1 {
   440  		return false
   441  	}
   442  	l, h := ran.LowVal[0], ran.HighVal[0]
   443  	if l.IsNull() && h.IsNull() {
   444  		return true
   445  	}
   446  	return false
   447  }
   448  
   449  // outOfRangeEQSelectivity estimates selectivities for out-of-range values.
   450  // It assumes all modifications are insertions and all new-inserted rows are uniformly distributed
   451  // and has the same distribution with analyzed rows, which means each unique value should have the
   452  // same number of rows(Tot/NDV) of it.
   453  func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
   454  	if modifyRows == 0 {
   455  		return 0 // it must be 0 since the histogram contains the whole data
   456  	}
   457  	if ndv < outOfRangeBetweenRate {
   458  		ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV
   459  	}
   460  	selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
   461  	if selectivity*float64(totalRows) > float64(modifyRows) {
   462  		selectivity = float64(modifyRows) / float64(totalRows)
   463  	}
   464  	return selectivity
   465  }
   466  
   467  // getEqualCondSelectivity gets the selectivity of the equal conditions.
   468  func (coll *HistDefCausl) getEqualCondSelectivity(idx *Index, bytes []byte, usedDefCaussLen int) float64 {
   469  	coverAll := len(idx.Info.DeferredCausets) == usedDefCaussLen
   470  	// In this case, the event count is at most 1.
   471  	if idx.Info.Unique && coverAll {
   472  		return 1.0 / float64(idx.TotalRowCount())
   473  	}
   474  	val := types.NewBytesCauset(bytes)
   475  	if idx.outOfRange(val) {
   476  		// When the value is out of range, we could not found this value in the CM Sketch,
   477  		// so we use heuristic methods to estimate the selectivity.
   478  		if idx.NDV > 0 && coverAll {
   479  			return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
   480  		}
   481  		// The equal condition only uses prefix columns of the index.
   482  		colIDs := coll.Idx2DeferredCausetIDs[idx.ID]
   483  		var ndv int64
   484  		for i, colID := range colIDs {
   485  			if i >= usedDefCaussLen {
   486  				break
   487  			}
   488  			ndv = mathutil.MaxInt64(ndv, coll.DeferredCausets[colID].NDV)
   489  		}
   490  		return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
   491  	}
   492  	return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
   493  }
   494  
// getIndexRowCount estimates the number of rows matched by indexRanges on the
// index idxID by combining the index's CM sketch (for the equality prefix of
// each range) with histogram estimates (for the trailing range condition).
//
// For every range it:
//  1. finds the position of the first non-point column (GetOrdinalOfRangeCond)
//     and tries to enumerate the values of that column so they can be treated
//     as additional equality values;
//  2. falls back to idx.GetRowCount when there is no equality prefix at all or
//     the range is [NULL, NULL] on a single-column index;
//  3. otherwise computes the selectivity of the equality part with
//     getEqualCondSelectivity and, if a range column remains, multiplies it by
//     the selectivity of that column's range.
//
// The sum over all ranges is capped at the index's total row count.
// The index is not nil-checked here; the caller in this file,
// GetRowCountByIndexRanges, checks it before calling.
func (coll *HistDefCausl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
	idx := coll.Indices[idxID]
	totalCount := float64(0)
	for _, ran := range indexRanges {
		rangePosition := GetOrdinalOfRangeCond(sc, ran)
		var rangeVals []types.Causet
		// Try to enumerate the values of the last (range) column; on success
		// that column is treated as part of the equality prefix.
		if rangePosition != len(ran.LowVal) {
			rangeVals = enumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude)
			if rangeVals != nil {
				rangePosition++
			}
		}
		// If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range
		// on single-column index, use previous way as well, because CMSketch does not contain null
		// values in this case.
		if rangePosition == 0 || isSingleDefCausIdxNullRange(idx, ran) {
			count, err := idx.GetRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount)
			if err != nil {
				return 0, errors.Trace(err)
			}
			totalCount += count
			continue
		}
		var selectivity float64
		// use CM Sketch to estimate the equal conditions
		if rangeVals == nil {
			bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
			if err != nil {
				return 0, errors.Trace(err)
			}
			selectivity = coll.getEqualCondSelectivity(idx, bytes, rangePosition)
		} else {
			// Encode the shared equality prefix once, then append each
			// enumerated value in turn and sum the per-value selectivities.
			bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition-1]...)
			if err != nil {
				return 0, errors.Trace(err)
			}
			prefixLen := len(bytes)
			for _, val := range rangeVals {
				// Truncate back to the prefix so the buffer can be reused.
				bytes = bytes[:prefixLen]
				bytes, err = codec.EncodeKey(sc, bytes, val)
				if err != nil {
					return 0, err
				}
				selectivity += coll.getEqualCondSelectivity(idx, bytes, rangePosition)
			}
		}
		// use histogram to estimate the range condition
		if rangePosition != len(ran.LowVal) {
			// Single-column range built from the bounds at rangePosition.
			rang := ranger.Range{
				LowVal:      []types.Causet{ran.LowVal[rangePosition]},
				LowExclude:  ran.LowExclude,
				HighVal:     []types.Causet{ran.HighVal[rangePosition]},
				HighExclude: ran.HighExclude,
			}
			var count float64
			var err error
			colIDs := coll.Idx2DeferredCausetIDs[idxID]
			var colID int64
			// -1 is used when no column ID is recorded for this position, so
			// the lookups below miss.
			if rangePosition >= len(colIDs) {
				colID = -1
			} else {
				colID = colIDs[rangePosition]
			}
			// prefer index stats over column stats
			// NOTE: the `idx` declared here is an index ID that shadows the
			// outer *Index only inside this if/else statement.
			if idx, ok := coll.DefCausID2IdxID[colID]; ok {
				count, err = coll.GetRowCountByIndexRanges(sc, idx, []*ranger.Range{&rang})
			} else {
				count, err = coll.GetRowCountByDeferredCausetRanges(sc, colID, []*ranger.Range{&rang})
			}
			if err != nil {
				return 0, errors.Trace(err)
			}
			selectivity = selectivity * count / float64(idx.TotalRowCount())
		}
		totalCount += selectivity * float64(idx.TotalRowCount())
	}
	// The estimate can never exceed the number of rows in the index.
	if totalCount > idx.TotalRowCount() {
		totalCount = idx.TotalRowCount()
	}
	return totalCount, nil
}
   577  
// fakePhysicalID is the placeholder PhysicalID that PseudoTable assigns to the
// pseudo column statistics it creates.
const fakePhysicalID int64 = -1
   579  
   580  // PseudoTable creates a pseudo causet statistics.
   581  func PseudoTable(tblInfo *perceptron.TableInfo) *Block {
   582  	pseudoHistDefCausl := HistDefCausl{
   583  		Count:           PseudoRowCount,
   584  		PhysicalID:      tblInfo.ID,
   585  		HavePhysicalID:  true,
   586  		DeferredCausets: make(map[int64]*DeferredCauset, len(tblInfo.DeferredCausets)),
   587  		Indices:         make(map[int64]*Index, len(tblInfo.Indices)),
   588  		Pseudo:          true,
   589  	}
   590  	t := &Block{
   591  		HistDefCausl: pseudoHistDefCausl,
   592  	}
   593  	for _, col := range tblInfo.DeferredCausets {
   594  		if col.State == perceptron.StatePublic {
   595  			t.DeferredCausets[col.ID] = &DeferredCauset{
   596  				PhysicalID: fakePhysicalID,
   597  				Info:       col,
   598  				IsHandle:   tblInfo.PKIsHandle && allegrosql.HasPriKeyFlag(col.Flag),
   599  				Histogram:  *NewHistogram(col.ID, 0, 0, 0, &col.FieldType, 0, 0),
   600  			}
   601  		}
   602  	}
   603  	for _, idx := range tblInfo.Indices {
   604  		if idx.State == perceptron.StatePublic {
   605  			t.Indices[idx.ID] = &Index{
   606  				Info:      idx,
   607  				Histogram: *NewHistogram(idx.ID, 0, 0, 0, types.NewFieldType(allegrosql.TypeBlob), 0, 0)}
   608  		}
   609  	}
   610  	return t
   611  }
   612  
   613  func getPseudoRowCountByIndexRanges(sc *stmtctx.StatementContext, indexRanges []*ranger.Range,
   614  	blockRowCount float64, defcausLen int) (float64, error) {
   615  	if blockRowCount == 0 {
   616  		return 0, nil
   617  	}
   618  	var totalCount float64
   619  	for _, indexRange := range indexRanges {
   620  		count := blockRowCount
   621  		i, err := indexRange.PrefixEqualLen(sc)
   622  		if err != nil {
   623  			return 0, errors.Trace(err)
   624  		}
   625  		if i == defcausLen && !indexRange.LowExclude && !indexRange.HighExclude {
   626  			totalCount += 1.0
   627  			continue
   628  		}
   629  		if i >= len(indexRange.LowVal) {
   630  			i = len(indexRange.LowVal) - 1
   631  		}
   632  		rowCount, err := GetPseudoRowCountByDeferredCausetRanges(sc, blockRowCount, []*ranger.Range{indexRange}, i)
   633  		if err != nil {
   634  			return 0, errors.Trace(err)
   635  		}
   636  		count = count / blockRowCount * rowCount
   637  		// If the condition is a = 1, b = 1, c = 1, d = 1, we think every a=1, b=1, c=1 only filtrate 1/100 data,
   638  		// so as to avoid collapsing too fast.
   639  		for j := 0; j < i; j++ {
   640  			count = count / float64(100)
   641  		}
   642  		totalCount += count
   643  	}
   644  	if totalCount > blockRowCount {
   645  		totalCount = blockRowCount / 3.0
   646  	}
   647  	return totalCount, nil
   648  }
   649  
   650  // GetPseudoRowCountByDeferredCausetRanges calculate the event count by the ranges if there's no statistics information for this column.
   651  func GetPseudoRowCountByDeferredCausetRanges(sc *stmtctx.StatementContext, blockRowCount float64, columnRanges []*ranger.Range, colIdx int) (float64, error) {
   652  	var rowCount float64
   653  	var err error
   654  	for _, ran := range columnRanges {
   655  		if ran.LowVal[colIdx].HoTT() == types.HoTTNull && ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue {
   656  			rowCount += blockRowCount
   657  		} else if ran.LowVal[colIdx].HoTT() == types.HoTTMinNotNull {
   658  			nullCount := blockRowCount / pseudoEqualRate
   659  			if ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue {
   660  				rowCount += blockRowCount - nullCount
   661  			} else if err == nil {
   662  				lessCount := blockRowCount / pseudoLessRate
   663  				rowCount += lessCount - nullCount
   664  			}
   665  		} else if ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue {
   666  			rowCount += blockRowCount / pseudoLessRate
   667  		} else {
   668  			compare, err1 := ran.LowVal[colIdx].CompareCauset(sc, &ran.HighVal[colIdx])
   669  			if err1 != nil {
   670  				return 0, errors.Trace(err1)
   671  			}
   672  			if compare == 0 {
   673  				rowCount += blockRowCount / pseudoEqualRate
   674  			} else {
   675  				rowCount += blockRowCount / pseudoBetweenRate
   676  			}
   677  		}
   678  		if err != nil {
   679  			return 0, errors.Trace(err)
   680  		}
   681  	}
   682  	if rowCount > blockRowCount {
   683  		rowCount = blockRowCount
   684  	}
   685  	return rowCount, nil
   686  }
   687  
   688  func getPseudoRowCountBySignedIntRanges(intRanges []*ranger.Range, blockRowCount float64) float64 {
   689  	var rowCount float64
   690  	for _, rg := range intRanges {
   691  		var cnt float64
   692  		low := rg.LowVal[0].GetInt64()
   693  		if rg.LowVal[0].HoTT() == types.HoTTNull || rg.LowVal[0].HoTT() == types.HoTTMinNotNull {
   694  			low = math.MinInt64
   695  		}
   696  		high := rg.HighVal[0].GetInt64()
   697  		if rg.HighVal[0].HoTT() == types.HoTTMaxValue {
   698  			high = math.MaxInt64
   699  		}
   700  		if low == math.MinInt64 && high == math.MaxInt64 {
   701  			cnt = blockRowCount
   702  		} else if low == math.MinInt64 {
   703  			cnt = blockRowCount / pseudoLessRate
   704  		} else if high == math.MaxInt64 {
   705  			cnt = blockRowCount / pseudoLessRate
   706  		} else {
   707  			if low == high {
   708  				cnt = 1 // When primary key is handle, the equal event count is at most one.
   709  			} else {
   710  				cnt = blockRowCount / pseudoBetweenRate
   711  			}
   712  		}
   713  		if high-low > 0 && cnt > float64(high-low) {
   714  			cnt = float64(high - low)
   715  		}
   716  		rowCount += cnt
   717  	}
   718  	if rowCount > blockRowCount {
   719  		rowCount = blockRowCount
   720  	}
   721  	return rowCount
   722  }
   723  
   724  func getPseudoRowCountByUnsignedIntRanges(intRanges []*ranger.Range, blockRowCount float64) float64 {
   725  	var rowCount float64
   726  	for _, rg := range intRanges {
   727  		var cnt float64
   728  		low := rg.LowVal[0].GetUint64()
   729  		if rg.LowVal[0].HoTT() == types.HoTTNull || rg.LowVal[0].HoTT() == types.HoTTMinNotNull {
   730  			low = 0
   731  		}
   732  		high := rg.HighVal[0].GetUint64()
   733  		if rg.HighVal[0].HoTT() == types.HoTTMaxValue {
   734  			high = math.MaxUint64
   735  		}
   736  		if low == 0 && high == math.MaxUint64 {
   737  			cnt = blockRowCount
   738  		} else if low == 0 {
   739  			cnt = blockRowCount / pseudoLessRate
   740  		} else if high == math.MaxUint64 {
   741  			cnt = blockRowCount / pseudoLessRate
   742  		} else {
   743  			if low == high {
   744  				cnt = 1 // When primary key is handle, the equal event count is at most one.
   745  			} else {
   746  				cnt = blockRowCount / pseudoBetweenRate
   747  			}
   748  		}
   749  		if high > low && cnt > float64(high-low) {
   750  			cnt = float64(high - low)
   751  		}
   752  		rowCount += cnt
   753  	}
   754  	if rowCount > blockRowCount {
   755  		rowCount = blockRowCount
   756  	}
   757  	return rowCount
   758  }
   759  
   760  // GetAvgRowSize computes average event size for given columns.
   761  func (coll *HistDefCausl) GetAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, isEncodedKey bool, isForScan bool) (size float64) {
   762  	stochastikVars := ctx.GetStochastikVars()
   763  	if coll.Pseudo || len(coll.DeferredCausets) == 0 || coll.Count == 0 {
   764  		size = pseudoDefCausSize * float64(len(defcaus))
   765  	} else {
   766  		for _, col := range defcaus {
   767  			colHist, ok := coll.DeferredCausets[col.UniqueID]
   768  			// Normally this would not happen, it is for compatibility with old version stats which
   769  			// does not include TotDefCausSize.
   770  			if !ok || (!colHist.IsHandle && colHist.TotDefCausSize == 0 && (colHist.NullCount != coll.Count)) {
   771  				size += pseudoDefCausSize
   772  				continue
   773  			}
   774  			// We differentiate if the column is encoded as key or value, because the resulted size
   775  			// is different.
   776  			if stochastikVars.EnableChunkRPC && !isForScan {
   777  				size += colHist.AvgDefCausSizeChunkFormat(coll.Count)
   778  			} else {
   779  				size += colHist.AvgDefCausSize(coll.Count, isEncodedKey)
   780  			}
   781  		}
   782  	}
   783  	if stochastikVars.EnableChunkRPC && !isForScan {
   784  		// Add 1/8 byte for each column's nullBitMap byte.
   785  		return size + float64(len(defcaus))/8
   786  	}
   787  	// Add 1 byte for each column's flag byte. See `encode` for details.
   788  	return size + float64(len(defcaus))
   789  }
   790  
   791  // GetAvgRowSizeListInDisk computes average event size for given columns.
   792  func (coll *HistDefCausl) GetAvgRowSizeListInDisk(defcaus []*memex.DeferredCauset) (size float64) {
   793  	if coll.Pseudo || len(coll.DeferredCausets) == 0 || coll.Count == 0 {
   794  		for _, col := range defcaus {
   795  			size += float64(chunk.EstimateTypeWidth(col.GetType()))
   796  		}
   797  	} else {
   798  		for _, col := range defcaus {
   799  			colHist, ok := coll.DeferredCausets[col.UniqueID]
   800  			// Normally this would not happen, it is for compatibility with old version stats which
   801  			// does not include TotDefCausSize.
   802  			if !ok || (!colHist.IsHandle && colHist.TotDefCausSize == 0 && (colHist.NullCount != coll.Count)) {
   803  				size += float64(chunk.EstimateTypeWidth(col.GetType()))
   804  				continue
   805  			}
   806  			size += colHist.AvgDefCausSizeListInDisk(coll.Count)
   807  		}
   808  	}
   809  	// Add 8 byte for each column's size record. See `ListInDisk` for details.
   810  	return size + float64(8*len(defcaus))
   811  }
   812  
   813  // GetTableAvgRowSize computes average event size for a causet scan, exclude the index key-value pairs.
   814  func (coll *HistDefCausl) GetTableAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, storeType ekv.StoreType, handleInDefCauss bool) (size float64) {
   815  	size = coll.GetAvgRowSize(ctx, defcaus, false, true)
   816  	switch storeType {
   817  	case ekv.EinsteinDB:
   818  		size += blockcodec.RecordRowKeyLen
   819  		// The `defcaus` for EinsteinDB always contain the row_id, so prefix event size subtract its length.
   820  		size -= 8
   821  	case ekv.TiFlash:
   822  		if !handleInDefCauss {
   823  			size += 8 /* row_id length */
   824  		}
   825  	}
   826  	return
   827  }
   828  
   829  // GetIndexAvgRowSize computes average event size for a index scan.
   830  func (coll *HistDefCausl) GetIndexAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, isUnique bool) (size float64) {
   831  	size = coll.GetAvgRowSize(ctx, defcaus, true, true)
   832  	// blockPrefix(1) + blockID(8) + indexPrefix(2) + indexID(8)
   833  	// Because the defcaus for index scan always contain the handle, so we don't add the rowID here.
   834  	size += 19
   835  	if !isUnique {
   836  		// add the len("_")
   837  		size++
   838  	}
   839  	return
   840  }