github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/selectivity.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statistics 15 16 import ( 17 "math" 18 "math/bits" 19 "sort" 20 21 "github.com/whtcorpsinc/errors" 22 "github.com/whtcorpsinc/BerolinaSQL/ast" 23 "github.com/whtcorpsinc/BerolinaSQL/allegrosql" 24 "github.com/whtcorpsinc/milevadb/memex" 25 planutil "github.com/whtcorpsinc/milevadb/causet/soliton" 26 "github.com/whtcorpsinc/milevadb/stochastikctx" 27 "github.com/whtcorpsinc/milevadb/types" 28 "github.com/whtcorpsinc/milevadb/soliton/logutil" 29 "github.com/whtcorpsinc/milevadb/soliton/ranger" 30 "go.uber.org/zap" 31 ) 32 33 // If one condition can't be calculated, we will assume that the selectivity of this condition is 0.8. 34 const selectionFactor = 0.8 35 36 // StatsNode is used for calculating selectivity. 37 type StatsNode struct { 38 Tp int 39 ID int64 40 // mask is a bit pattern whose ith bit will indicate whether the ith memex is covered by this index/column. 41 mask int64 42 // Ranges contains all the Ranges we got. 43 Ranges []*ranger.Range 44 // Selectivity indicates the Selectivity of this column/index. 45 Selectivity float64 46 // numDefCauss is the number of columns contained in the index or column(which is always 1). 47 numDefCauss int 48 // partCover indicates whether the bit in the mask is for a full cover or partial cover. It is only true 49 // when the condition is a DNF memex on index, and the memex is not totally extracted as access condition. 50 partCover bool 51 } 52 53 // The type of the StatsNode. 54 const ( 55 IndexType = iota 56 PkType 57 DefCausType 58 ) 59 60 func compareType(l, r int) int { 61 if l == r { 62 return 0 63 } 64 if l == DefCausType { 65 return -1 66 } 67 if l == PkType { 68 return 1 69 } 70 if r == DefCausType { 71 return 1 72 } 73 return -1 74 } 75 76 // MockStatsNode is only used for test. 77 func MockStatsNode(id int64, m int64, num int) *StatsNode { 78 return &StatsNode{ID: id, mask: m, numDefCauss: num} 79 } 80 81 const unknownDeferredCausetID = math.MinInt64 82 83 // getConstantDeferredCausetID receives two memexs and if one of them is column and another is constant, it returns the 84 // ID of the column. 85 func getConstantDeferredCausetID(e []memex.Expression) int64 { 86 if len(e) != 2 { 87 return unknownDeferredCausetID 88 } 89 col, ok1 := e[0].(*memex.DeferredCauset) 90 _, ok2 := e[1].(*memex.Constant) 91 if ok1 && ok2 { 92 return col.ID 93 } 94 col, ok1 = e[1].(*memex.DeferredCauset) 95 _, ok2 = e[0].(*memex.Constant) 96 if ok1 && ok2 { 97 return col.ID 98 } 99 return unknownDeferredCausetID 100 } 101 102 func pseudoSelectivity(coll *HistDefCausl, exprs []memex.Expression) float64 { 103 minFactor := selectionFactor 104 colExists := make(map[string]bool) 105 for _, expr := range exprs { 106 fun, ok := expr.(*memex.ScalarFunction) 107 if !ok { 108 continue 109 } 110 colID := getConstantDeferredCausetID(fun.GetArgs()) 111 if colID == unknownDeferredCausetID { 112 continue 113 } 114 switch fun.FuncName.L { 115 case ast.EQ, ast.NullEQ, ast.In: 116 minFactor = math.Min(minFactor, 1.0/pseudoEqualRate) 117 col, ok := coll.DeferredCausets[colID] 118 if !ok { 119 continue 120 } 121 colExists[col.Info.Name.L] = true 122 if allegrosql.HasUniKeyFlag(col.Info.Flag) { 123 return 1.0 / float64(coll.Count) 124 } 125 case ast.GE, ast.GT, ast.LE, ast.LT: 126 minFactor = math.Min(minFactor, 1.0/pseudoLessRate) 127 // FIXME: To resolve the between case. 128 } 129 } 130 if len(colExists) == 0 { 131 return minFactor 132 } 133 // use the unique key info 134 for _, idx := range coll.Indices { 135 if !idx.Info.Unique { 136 continue 137 } 138 unique := true 139 for _, col := range idx.Info.DeferredCausets { 140 if !colExists[col.Name.L] { 141 unique = false 142 break 143 } 144 } 145 if unique { 146 return 1.0 / float64(coll.Count) 147 } 148 } 149 return minFactor 150 } 151 152 // isDefCausEqCorDefCaus checks if the memex is a eq function that one side is correlated column and another is column. 153 // If so, it will return the column's reference. Otherwise return nil instead. 154 func isDefCausEqCorDefCaus(filter memex.Expression) *memex.DeferredCauset { 155 f, ok := filter.(*memex.ScalarFunction) 156 if !ok || f.FuncName.L != ast.EQ { 157 return nil 158 } 159 if c, ok := f.GetArgs()[0].(*memex.DeferredCauset); ok { 160 if _, ok := f.GetArgs()[1].(*memex.CorrelatedDeferredCauset); ok { 161 return c 162 } 163 } 164 if c, ok := f.GetArgs()[1].(*memex.DeferredCauset); ok { 165 if _, ok := f.GetArgs()[0].(*memex.CorrelatedDeferredCauset); ok { 166 return c 167 } 168 } 169 return nil 170 } 171 172 // Selectivity is a function calculate the selectivity of the memexs. 173 // The definition of selectivity is (event count after filter / event count before filter). 174 // And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this. 175 // Currently the time complexity is o(n^2). 176 func (coll *HistDefCausl) Selectivity(ctx stochastikctx.Context, exprs []memex.Expression, filledPaths []*planutil.AccessPath) (float64, []*StatsNode, error) { 177 // If causet's count is zero or conditions are empty, we should return 100% selectivity. 178 if coll.Count == 0 || len(exprs) == 0 { 179 return 1, nil, nil 180 } 181 // TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64. 182 // This will simplify some code and speed up if we use this rather than a boolean slice. 183 if len(exprs) > 63 || (len(coll.DeferredCausets) == 0 && len(coll.Indices) == 0) { 184 return pseudoSelectivity(coll, exprs), nil, nil 185 } 186 ret := 1.0 187 var nodes []*StatsNode 188 sc := ctx.GetStochastikVars().StmtCtx 189 190 remainedExprs := make([]memex.Expression, 0, len(exprs)) 191 192 // Deal with the correlated column. 193 for _, expr := range exprs { 194 c := isDefCausEqCorDefCaus(expr) 195 if c == nil { 196 remainedExprs = append(remainedExprs, expr) 197 continue 198 } 199 200 if colHist := coll.DeferredCausets[c.UniqueID]; colHist == nil || colHist.IsInvalid(sc, coll.Pseudo) { 201 ret *= 1.0 / pseudoEqualRate 202 continue 203 } 204 205 colHist := coll.DeferredCausets[c.UniqueID] 206 if colHist.NDV > 0 { 207 ret *= 1 / float64(colHist.NDV) 208 } else { 209 ret *= 1.0 / pseudoEqualRate 210 } 211 } 212 213 extractedDefCauss := make([]*memex.DeferredCauset, 0, len(coll.DeferredCausets)) 214 extractedDefCauss = memex.ExtractDeferredCausetsFromExpressions(extractedDefCauss, remainedExprs, nil) 215 for id, colInfo := range coll.DeferredCausets { 216 col := memex.DefCausInfo2DefCaus(extractedDefCauss, colInfo.Info) 217 if col != nil { 218 maskCovered, ranges, _, err := getMaskAndRanges(ctx, remainedExprs, ranger.DeferredCausetRangeType, nil, nil, col) 219 if err != nil { 220 return 0, nil, errors.Trace(err) 221 } 222 nodes = append(nodes, &StatsNode{Tp: DefCausType, ID: id, mask: maskCovered, Ranges: ranges, numDefCauss: 1}) 223 if colInfo.IsHandle { 224 nodes[len(nodes)-1].Tp = PkType 225 var cnt float64 226 cnt, err = coll.GetRowCountByIntDeferredCausetRanges(sc, id, ranges) 227 if err != nil { 228 return 0, nil, errors.Trace(err) 229 } 230 nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count) 231 continue 232 } 233 cnt, err := coll.GetRowCountByDeferredCausetRanges(sc, id, ranges) 234 if err != nil { 235 return 0, nil, errors.Trace(err) 236 } 237 nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count) 238 } 239 } 240 id2Paths := make(map[int64]*planutil.AccessPath) 241 for _, path := range filledPaths { 242 if path.IsTablePath() { 243 continue 244 } 245 id2Paths[path.Index.ID] = path 246 } 247 for id, idxInfo := range coll.Indices { 248 idxDefCauss := memex.FindPrefixOfIndex(extractedDefCauss, coll.Idx2DeferredCausetIDs[id]) 249 if len(idxDefCauss) > 0 { 250 lengths := make([]int, 0, len(idxDefCauss)) 251 for i := 0; i < len(idxDefCauss); i++ { 252 lengths = append(lengths, idxInfo.Info.DeferredCausets[i].Length) 253 } 254 maskCovered, ranges, partCover, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, id2Paths[idxInfo.ID], idxDefCauss...) 255 if err != nil { 256 return 0, nil, errors.Trace(err) 257 } 258 cnt, err := coll.GetRowCountByIndexRanges(sc, id, ranges) 259 if err != nil { 260 return 0, nil, errors.Trace(err) 261 } 262 selectivity := cnt / float64(coll.Count) 263 nodes = append(nodes, &StatsNode{ 264 Tp: IndexType, 265 ID: id, 266 mask: maskCovered, 267 Ranges: ranges, 268 numDefCauss: len(idxInfo.Info.DeferredCausets), 269 Selectivity: selectivity, 270 partCover: partCover, 271 }) 272 } 273 } 274 usedSets := GetUsableSetsByGreedy(nodes) 275 // Initialize the mask with the full set. 276 mask := (int64(1) << uint(len(remainedExprs))) - 1 277 for _, set := range usedSets { 278 mask &^= set.mask 279 ret *= set.Selectivity 280 // If `partCover` is true, it means that the conditions are in DNF form, and only part 281 // of the DNF memexs are extracted as access conditions, so besides from the selectivity 282 // of the extracted access conditions, we multiply another selectionFactor for the residual 283 // conditions. 284 if set.partCover { 285 ret *= selectionFactor 286 } 287 } 288 289 // Now we try to cover those still not covered DNF conditions using independence assumption, 290 // i.e., sel(condA or condB) = sel(condA) + sel(condB) - sel(condA) * sel(condB) 291 if mask > 0 { 292 for i, expr := range remainedExprs { 293 if mask&(1<<uint64(i)) == 0 { 294 continue 295 } 296 scalarCond, ok := expr.(*memex.ScalarFunction) 297 // Make sure we only handle DNF condition. 298 if !ok || scalarCond.FuncName.L != ast.LogicOr { 299 continue 300 } 301 dnfItems := memex.FlattenDNFConditions(scalarCond) 302 dnfItems = ranger.MergeDNFItems4DefCaus(ctx, dnfItems) 303 304 selectivity := 0.0 305 for _, cond := range dnfItems { 306 // In selectivity calculation, we don't handle CorrelatedDeferredCauset, so we directly skip over it. 307 // Other HoTTs of `Expression`, i.e., Constant, DeferredCauset and ScalarFunction all can possibly be built into 308 // ranges and used to calculation selectivity, so we accept them all. 309 _, ok := cond.(*memex.CorrelatedDeferredCauset) 310 if ok { 311 continue 312 } 313 314 var cnfItems []memex.Expression 315 if scalar, ok := cond.(*memex.ScalarFunction); ok && scalar.FuncName.L == ast.LogicAnd { 316 cnfItems = memex.FlattenCNFConditions(scalar) 317 } else { 318 cnfItems = append(cnfItems, cond) 319 } 320 321 curSelectivity, _, err := coll.Selectivity(ctx, cnfItems, nil) 322 if err != nil { 323 logutil.BgLogger().Debug("something wrong happened, use the default selectivity", zap.Error(err)) 324 selectivity = selectionFactor 325 } 326 327 selectivity = selectivity + curSelectivity - selectivity*curSelectivity 328 } 329 330 if selectivity != 0 { 331 ret *= selectivity 332 mask &^= 1 << uint64(i) 333 } 334 } 335 } 336 337 // If there's still conditions which cannot be calculated, we will multiply a selectionFactor. 338 if mask > 0 { 339 ret *= selectionFactor 340 } 341 return ret, nodes, nil 342 } 343 344 func getMaskAndRanges(ctx stochastikctx.Context, exprs []memex.Expression, rangeType ranger.RangeType, lengths []int, cachedPath *planutil.AccessPath, defcaus ...*memex.DeferredCauset) (mask int64, ranges []*ranger.Range, partCover bool, err error) { 345 sc := ctx.GetStochastikVars().StmtCtx 346 isDNF := false 347 var accessConds, remainedConds []memex.Expression 348 switch rangeType { 349 case ranger.DeferredCausetRangeType: 350 accessConds = ranger.ExtractAccessConditionsForDeferredCauset(exprs, defcaus[0].UniqueID) 351 ranges, err = ranger.BuildDeferredCausetRange(accessConds, sc, defcaus[0].RetType, types.UnspecifiedLength) 352 case ranger.IndexRangeType: 353 if cachedPath != nil { 354 ranges, accessConds, remainedConds, isDNF = cachedPath.Ranges, cachedPath.AccessConds, cachedPath.TableFilters, cachedPath.IsDNFCond 355 break 356 } 357 var res *ranger.DetachRangeResult 358 res, err = ranger.DetachCondAndBuildRangeForIndex(ctx, exprs, defcaus, lengths) 359 ranges, accessConds, remainedConds, isDNF = res.Ranges, res.AccessConds, res.RemainedConds, res.IsDNFCond 360 if err != nil { 361 return 0, nil, false, err 362 } 363 default: 364 panic("should never be here") 365 } 366 if err != nil { 367 return 0, nil, false, err 368 } 369 if isDNF && len(accessConds) > 0 { 370 mask |= 1 371 return mask, ranges, len(remainedConds) > 0, nil 372 } 373 for i := range exprs { 374 for j := range accessConds { 375 if exprs[i].Equal(ctx, accessConds[j]) { 376 mask |= 1 << uint64(i) 377 break 378 } 379 } 380 } 381 return mask, ranges, false, nil 382 } 383 384 // GetUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm. 385 func GetUsableSetsByGreedy(nodes []*StatsNode) (newBlocks []*StatsNode) { 386 sort.Slice(nodes, func(i int, j int) bool { 387 if r := compareType(nodes[i].Tp, nodes[j].Tp); r != 0 { 388 return r < 0 389 } 390 return nodes[i].ID < nodes[j].ID 391 }) 392 marked := make([]bool, len(nodes)) 393 mask := int64(math.MaxInt64) 394 for { 395 // Choose the index that covers most. 396 bestID, bestCount, bestTp, bestNumDefCauss, bestMask := -1, 0, DefCausType, 0, int64(0) 397 for i, set := range nodes { 398 if marked[i] { 399 continue 400 } 401 curMask := set.mask & mask 402 if curMask != set.mask { 403 marked[i] = true 404 continue 405 } 406 bits := bits.OnesCount64(uint64(curMask)) 407 // This set cannot cover any thing, just skip it. 408 if bits == 0 { 409 marked[i] = true 410 continue 411 } 412 // We greedy select the stats info based on: 413 // (1): The stats type, always prefer the primary key or index. 414 // (2): The number of memex that it covers, the more the better. 415 // (3): The number of columns that it contains, the less the better. 416 if (bestTp == DefCausType && set.Tp != DefCausType) || bestCount < bits || (bestCount == bits && bestNumDefCauss > set.numDefCauss) { 417 bestID, bestCount, bestTp, bestNumDefCauss, bestMask = i, bits, set.Tp, set.numDefCauss, curMask 418 } 419 } 420 if bestCount == 0 { 421 break 422 } 423 424 // UFIDelate the mask, remove the bit that nodes[bestID].mask has. 425 mask &^= bestMask 426 427 newBlocks = append(newBlocks, nodes[bestID]) 428 marked[bestID] = true 429 } 430 return 431 }