github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/selectivity.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"math"
    18  	"math/bits"
    19  	"sort"
    20  
    21  	"github.com/whtcorpsinc/errors"
    22  	"github.com/whtcorpsinc/BerolinaSQL/ast"
    23  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    24  	"github.com/whtcorpsinc/milevadb/memex"
    25  	planutil "github.com/whtcorpsinc/milevadb/causet/soliton"
    26  	"github.com/whtcorpsinc/milevadb/stochastikctx"
    27  	"github.com/whtcorpsinc/milevadb/types"
    28  	"github.com/whtcorpsinc/milevadb/soliton/logutil"
    29  	"github.com/whtcorpsinc/milevadb/soliton/ranger"
    30  	"go.uber.org/zap"
    31  )
    32  
// selectionFactor is the default selectivity assumed for a condition (or a
// residual part of a condition) whose selectivity cannot be calculated from
// the statistics.
const selectionFactor = 0.8
    35  
// StatsNode is used for calculating selectivity. Each node describes one
// column, primary key or index together with the conditions it covers.
type StatsNode struct {
	// Tp is the type of the node: IndexType, PkType or DefCausType.
	Tp int
	// ID is the key under which the column or index is stored in the
	// statistics collection the node was built from.
	ID int64
	// mask is a bit pattern whose ith bit will indicate whether the ith memex is covered by this index/column.
	mask int64
	// Ranges contains all the Ranges we got.
	Ranges []*ranger.Range
	// Selectivity indicates the Selectivity of this column/index.
	Selectivity float64
	// numDefCauss is the number of columns contained in the index or column(which is always 1).
	numDefCauss int
	// partCover indicates whether the bit in the mask is for a full cover or partial cover. It is only true
	// when the condition is a DNF memex on index, and the memex is not totally extracted as access condition.
	partCover bool
}
    52  
    53  // The type of the StatsNode.
    54  const (
    55  	IndexType = iota
    56  	PkType
    57  	DefCausType
    58  )
    59  
    60  func compareType(l, r int) int {
    61  	if l == r {
    62  		return 0
    63  	}
    64  	if l == DefCausType {
    65  		return -1
    66  	}
    67  	if l == PkType {
    68  		return 1
    69  	}
    70  	if r == DefCausType {
    71  		return 1
    72  	}
    73  	return -1
    74  }
    75  
    76  // MockStatsNode is only used for test.
    77  func MockStatsNode(id int64, m int64, num int) *StatsNode {
    78  	return &StatsNode{ID: id, mask: m, numDefCauss: num}
    79  }
    80  
    81  const unknownDeferredCausetID = math.MinInt64
    82  
    83  // getConstantDeferredCausetID receives two memexs and if one of them is column and another is constant, it returns the
    84  // ID of the column.
    85  func getConstantDeferredCausetID(e []memex.Expression) int64 {
    86  	if len(e) != 2 {
    87  		return unknownDeferredCausetID
    88  	}
    89  	col, ok1 := e[0].(*memex.DeferredCauset)
    90  	_, ok2 := e[1].(*memex.Constant)
    91  	if ok1 && ok2 {
    92  		return col.ID
    93  	}
    94  	col, ok1 = e[1].(*memex.DeferredCauset)
    95  	_, ok2 = e[0].(*memex.Constant)
    96  	if ok1 && ok2 {
    97  		return col.ID
    98  	}
    99  	return unknownDeferredCausetID
   100  }
   101  
   102  func pseudoSelectivity(coll *HistDefCausl, exprs []memex.Expression) float64 {
   103  	minFactor := selectionFactor
   104  	colExists := make(map[string]bool)
   105  	for _, expr := range exprs {
   106  		fun, ok := expr.(*memex.ScalarFunction)
   107  		if !ok {
   108  			continue
   109  		}
   110  		colID := getConstantDeferredCausetID(fun.GetArgs())
   111  		if colID == unknownDeferredCausetID {
   112  			continue
   113  		}
   114  		switch fun.FuncName.L {
   115  		case ast.EQ, ast.NullEQ, ast.In:
   116  			minFactor = math.Min(minFactor, 1.0/pseudoEqualRate)
   117  			col, ok := coll.DeferredCausets[colID]
   118  			if !ok {
   119  				continue
   120  			}
   121  			colExists[col.Info.Name.L] = true
   122  			if allegrosql.HasUniKeyFlag(col.Info.Flag) {
   123  				return 1.0 / float64(coll.Count)
   124  			}
   125  		case ast.GE, ast.GT, ast.LE, ast.LT:
   126  			minFactor = math.Min(minFactor, 1.0/pseudoLessRate)
   127  			// FIXME: To resolve the between case.
   128  		}
   129  	}
   130  	if len(colExists) == 0 {
   131  		return minFactor
   132  	}
   133  	// use the unique key info
   134  	for _, idx := range coll.Indices {
   135  		if !idx.Info.Unique {
   136  			continue
   137  		}
   138  		unique := true
   139  		for _, col := range idx.Info.DeferredCausets {
   140  			if !colExists[col.Name.L] {
   141  				unique = false
   142  				break
   143  			}
   144  		}
   145  		if unique {
   146  			return 1.0 / float64(coll.Count)
   147  		}
   148  	}
   149  	return minFactor
   150  }
   151  
   152  // isDefCausEqCorDefCaus checks if the memex is a eq function that one side is correlated column and another is column.
   153  // If so, it will return the column's reference. Otherwise return nil instead.
   154  func isDefCausEqCorDefCaus(filter memex.Expression) *memex.DeferredCauset {
   155  	f, ok := filter.(*memex.ScalarFunction)
   156  	if !ok || f.FuncName.L != ast.EQ {
   157  		return nil
   158  	}
   159  	if c, ok := f.GetArgs()[0].(*memex.DeferredCauset); ok {
   160  		if _, ok := f.GetArgs()[1].(*memex.CorrelatedDeferredCauset); ok {
   161  			return c
   162  		}
   163  	}
   164  	if c, ok := f.GetArgs()[1].(*memex.DeferredCauset); ok {
   165  		if _, ok := f.GetArgs()[0].(*memex.CorrelatedDeferredCauset); ok {
   166  			return c
   167  		}
   168  	}
   169  	return nil
   170  }
   171  
// Selectivity calculates the selectivity of the memexs.
// The definition of selectivity is (event count after filter / event count before filter).
// exprs must be in CNF, in other words `exprs[0] and exprs[1] and ... and exprs[len - 1]`
// should hold when you call this.
// Currently the time complexity is o(n^2).
//
// filledPaths may carry already-built index access paths; a non-table path is
// handed to getMaskAndRanges for the index with the matching ID.
//
// It returns the estimated selectivity, every StatsNode that was built (not
// only the ones chosen by the greedy selection; the slice has been sorted in
// place by GetUsableSetsByGreedy) and an error from range building or row
// counting. On error the returned selectivity is 0 and the nodes are nil.
func (coll *HistDefCausl) Selectivity(ctx stochastikctx.Context, exprs []memex.Expression, filledPaths []*planutil.AccessPath) (float64, []*StatsNode, error) {
	// If causet's count is zero or conditions are empty, we should return 100% selectivity.
	if coll.Count == 0 || len(exprs) == 0 {
		return 1, nil, nil
	}
	// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
	// This will simplify some code and speed up if we use this rather than a boolean slice.
	// Without any column or index statistics, or with too many conditions for the
	// int64 mask, fall back to the histogram-free estimate.
	if len(exprs) > 63 || (len(coll.DeferredCausets) == 0 && len(coll.Indices) == 0) {
		return pseudoSelectivity(coll, exprs), nil, nil
	}
	ret := 1.0
	var nodes []*StatsNode
	sc := ctx.GetStochastikVars().StmtCtx

	// remainedExprs collects every condition that is not a `column = correlated column`
	// filter; all mask bits below index into this slice, not into exprs.
	remainedExprs := make([]memex.Expression, 0, len(exprs))

	// Deal with the correlated column.
	for _, expr := range exprs {
		c := isDefCausEqCorDefCaus(expr)
		if c == nil {
			remainedExprs = append(remainedExprs, expr)
			continue
		}

		// No usable histogram for the column: use the pseudo equal rate.
		if colHist := coll.DeferredCausets[c.UniqueID]; colHist == nil || colHist.IsInvalid(sc, coll.Pseudo) {
			ret *= 1.0 / pseudoEqualRate
			continue
		}

		// Otherwise estimate the equality as 1/NDV, falling back to the pseudo
		// equal rate when NDV is not positive.
		colHist := coll.DeferredCausets[c.UniqueID]
		if colHist.NDV > 0 {
			ret *= 1 / float64(colHist.NDV)
		} else {
			ret *= 1.0 / pseudoEqualRate
		}
	}

	// Build one StatsNode per column that is referenced by the remaining conditions.
	extractedDefCauss := make([]*memex.DeferredCauset, 0, len(coll.DeferredCausets))
	extractedDefCauss = memex.ExtractDeferredCausetsFromExpressions(extractedDefCauss, remainedExprs, nil)
	for id, colInfo := range coll.DeferredCausets {
		col := memex.DefCausInfo2DefCaus(extractedDefCauss, colInfo.Info)
		if col != nil {
			maskCovered, ranges, _, err := getMaskAndRanges(ctx, remainedExprs, ranger.DeferredCausetRangeType, nil, nil, col)
			if err != nil {
				return 0, nil, errors.Trace(err)
			}
			nodes = append(nodes, &StatsNode{Tp: DefCausType, ID: id, mask: maskCovered, Ranges: ranges, numDefCauss: 1})
			// A handle column becomes a PkType node and is counted through the
			// integer-column range estimator.
			if colInfo.IsHandle {
				nodes[len(nodes)-1].Tp = PkType
				var cnt float64
				cnt, err = coll.GetRowCountByIntDeferredCausetRanges(sc, id, ranges)
				if err != nil {
					return 0, nil, errors.Trace(err)
				}
				nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
				continue
			}
			cnt, err := coll.GetRowCountByDeferredCausetRanges(sc, id, ranges)
			if err != nil {
				return 0, nil, errors.Trace(err)
			}
			nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
		}
	}
	// Index the supplied non-table access paths by index ID so they can be
	// passed to getMaskAndRanges below.
	id2Paths := make(map[int64]*planutil.AccessPath)
	for _, path := range filledPaths {
		if path.IsTablePath() {
			continue
		}
		id2Paths[path.Index.ID] = path
	}
	// Build one StatsNode per index that has a non-empty prefix among the
	// referenced columns.
	for id, idxInfo := range coll.Indices {
		idxDefCauss := memex.FindPrefixOfIndex(extractedDefCauss, coll.Idx2DeferredCausetIDs[id])
		if len(idxDefCauss) > 0 {
			lengths := make([]int, 0, len(idxDefCauss))
			for i := 0; i < len(idxDefCauss); i++ {
				lengths = append(lengths, idxInfo.Info.DeferredCausets[i].Length)
			}
			maskCovered, ranges, partCover, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, id2Paths[idxInfo.ID], idxDefCauss...)
			if err != nil {
				return 0, nil, errors.Trace(err)
			}
			cnt, err := coll.GetRowCountByIndexRanges(sc, id, ranges)
			if err != nil {
				return 0, nil, errors.Trace(err)
			}
			selectivity := cnt / float64(coll.Count)
			nodes = append(nodes, &StatsNode{
				Tp:          IndexType,
				ID:          id,
				mask:        maskCovered,
				Ranges:      ranges,
				numDefCauss:     len(idxInfo.Info.DeferredCausets),
				Selectivity: selectivity,
				partCover:   partCover,
			})
		}
	}
	// Pick a set of nodes with disjoint masks and multiply their selectivities.
	usedSets := GetUsableSetsByGreedy(nodes)
	// Initialize the mask with the full set.
	mask := (int64(1) << uint(len(remainedExprs))) - 1
	for _, set := range usedSets {
		// Clear the bits of the conditions this node accounts for.
		mask &^= set.mask
		ret *= set.Selectivity
		// If `partCover` is true, it means that the conditions are in DNF form, and only part
		// of the DNF memexs are extracted as access conditions, so besides from the selectivity
		// of the extracted access conditions, we multiply another selectionFactor for the residual
		// conditions.
		if set.partCover {
			ret *= selectionFactor
		}
	}

	// Now we try to cover those still not covered DNF conditions using independence assumption,
	// i.e., sel(condA or condB) = sel(condA) + sel(condB) - sel(condA) * sel(condB)
	if mask > 0 {
		for i, expr := range remainedExprs {
			// Skip conditions that are already covered.
			if mask&(1<<uint64(i)) == 0 {
				continue
			}
			scalarCond, ok := expr.(*memex.ScalarFunction)
			// Make sure we only handle DNF condition.
			if !ok || scalarCond.FuncName.L != ast.LogicOr {
				continue
			}
			dnfItems := memex.FlattenDNFConditions(scalarCond)
			dnfItems = ranger.MergeDNFItems4DefCaus(ctx, dnfItems)

			selectivity := 0.0
			for _, cond := range dnfItems {
				// In selectivity calculation, we don't handle CorrelatedDeferredCauset, so we directly skip over it.
				// Other kinds of `Expression`, i.e., Constant, DeferredCauset and ScalarFunction all can possibly be built into
				// ranges and used to calculation selectivity, so we accept them all.
				_, ok := cond.(*memex.CorrelatedDeferredCauset)
				if ok {
					continue
				}

				// Each DNF item is itself treated as a CNF list and estimated recursively.
				var cnfItems []memex.Expression
				if scalar, ok := cond.(*memex.ScalarFunction); ok && scalar.FuncName.L == ast.LogicAnd {
					cnfItems = memex.FlattenCNFConditions(scalar)
				} else {
					cnfItems = append(cnfItems, cond)
				}

				curSelectivity, _, err := coll.Selectivity(ctx, cnfItems, nil)
				// NOTE(review): on error the accumulated value is replaced by selectionFactor,
				// the loop keeps going, and the value is still combined below with
				// curSelectivity (0 on every error return of this function) and with any
				// later items — confirm this is the intended "default selectivity" behavior.
				if err != nil {
					logutil.BgLogger().Debug("something wrong happened, use the default selectivity", zap.Error(err))
					selectivity = selectionFactor
				}

				selectivity = selectivity + curSelectivity - selectivity*curSelectivity
			}

			// A zero estimate is treated as "could not be calculated": the bit stays
			// set and the condition is handled by the selectionFactor fallback below.
			if selectivity != 0 {
				ret *= selectivity
				mask &^= 1 << uint64(i)
			}
		}
	}

	// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
	// The factor is applied once, no matter how many conditions remain uncovered.
	if mask > 0 {
		ret *= selectionFactor
	}
	return ret, nodes, nil
}
   343  
   344  func getMaskAndRanges(ctx stochastikctx.Context, exprs []memex.Expression, rangeType ranger.RangeType, lengths []int, cachedPath *planutil.AccessPath, defcaus ...*memex.DeferredCauset) (mask int64, ranges []*ranger.Range, partCover bool, err error) {
   345  	sc := ctx.GetStochastikVars().StmtCtx
   346  	isDNF := false
   347  	var accessConds, remainedConds []memex.Expression
   348  	switch rangeType {
   349  	case ranger.DeferredCausetRangeType:
   350  		accessConds = ranger.ExtractAccessConditionsForDeferredCauset(exprs, defcaus[0].UniqueID)
   351  		ranges, err = ranger.BuildDeferredCausetRange(accessConds, sc, defcaus[0].RetType, types.UnspecifiedLength)
   352  	case ranger.IndexRangeType:
   353  		if cachedPath != nil {
   354  			ranges, accessConds, remainedConds, isDNF = cachedPath.Ranges, cachedPath.AccessConds, cachedPath.TableFilters, cachedPath.IsDNFCond
   355  			break
   356  		}
   357  		var res *ranger.DetachRangeResult
   358  		res, err = ranger.DetachCondAndBuildRangeForIndex(ctx, exprs, defcaus, lengths)
   359  		ranges, accessConds, remainedConds, isDNF = res.Ranges, res.AccessConds, res.RemainedConds, res.IsDNFCond
   360  		if err != nil {
   361  			return 0, nil, false, err
   362  		}
   363  	default:
   364  		panic("should never be here")
   365  	}
   366  	if err != nil {
   367  		return 0, nil, false, err
   368  	}
   369  	if isDNF && len(accessConds) > 0 {
   370  		mask |= 1
   371  		return mask, ranges, len(remainedConds) > 0, nil
   372  	}
   373  	for i := range exprs {
   374  		for j := range accessConds {
   375  			if exprs[i].Equal(ctx, accessConds[j]) {
   376  				mask |= 1 << uint64(i)
   377  				break
   378  			}
   379  		}
   380  	}
   381  	return mask, ranges, false, nil
   382  }
   383  
   384  // GetUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm.
   385  func GetUsableSetsByGreedy(nodes []*StatsNode) (newBlocks []*StatsNode) {
   386  	sort.Slice(nodes, func(i int, j int) bool {
   387  		if r := compareType(nodes[i].Tp, nodes[j].Tp); r != 0 {
   388  			return r < 0
   389  		}
   390  		return nodes[i].ID < nodes[j].ID
   391  	})
   392  	marked := make([]bool, len(nodes))
   393  	mask := int64(math.MaxInt64)
   394  	for {
   395  		// Choose the index that covers most.
   396  		bestID, bestCount, bestTp, bestNumDefCauss, bestMask := -1, 0, DefCausType, 0, int64(0)
   397  		for i, set := range nodes {
   398  			if marked[i] {
   399  				continue
   400  			}
   401  			curMask := set.mask & mask
   402  			if curMask != set.mask {
   403  				marked[i] = true
   404  				continue
   405  			}
   406  			bits := bits.OnesCount64(uint64(curMask))
   407  			// This set cannot cover any thing, just skip it.
   408  			if bits == 0 {
   409  				marked[i] = true
   410  				continue
   411  			}
   412  			// We greedy select the stats info based on:
   413  			// (1): The stats type, always prefer the primary key or index.
   414  			// (2): The number of memex that it covers, the more the better.
   415  			// (3): The number of columns that it contains, the less the better.
   416  			if (bestTp == DefCausType && set.Tp != DefCausType) || bestCount < bits || (bestCount == bits && bestNumDefCauss > set.numDefCauss) {
   417  				bestID, bestCount, bestTp, bestNumDefCauss, bestMask = i, bits, set.Tp, set.numDefCauss, curMask
   418  			}
   419  		}
   420  		if bestCount == 0 {
   421  			break
   422  		}
   423  
   424  		// UFIDelate the mask, remove the bit that nodes[bestID].mask has.
   425  		mask &^= bestMask
   426  
   427  		newBlocks = append(newBlocks, nodes[bestID])
   428  		marked[bestID] = true
   429  	}
   430  	return
   431  }