// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"fmt"
	"math"
	"sort"
	"strings"
	"sync"

	"github.com/cznic/mathutil"
	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
	"github.com/whtcorpsinc/BerolinaSQL/perceptron"
	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/milevadb/blockcodec"
	"github.com/whtcorpsinc/milevadb/ekv"
	"github.com/whtcorpsinc/milevadb/memex"
	"github.com/whtcorpsinc/milevadb/soliton/chunk"
	"github.com/whtcorpsinc/milevadb/soliton/codec"
	"github.com/whtcorpsinc/milevadb/soliton/ranger"
	"github.com/whtcorpsinc/milevadb/stochastikctx"
	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
	"github.com/whtcorpsinc/milevadb/types"
	"go.uber.org/atomic"
)

// Constants used by the pseudo estimations in this file. The three rates are
// divisors applied to the row count when a column or index has no usable
// statistics.
const (
	pseudoEqualRate   = 1000 // an equal condition selects 1/1000 of the rows
	pseudoLessRate    = 3    // a less (or greater) condition selects 1/3 of the rows
	pseudoBetweenRate = 40   // a between condition selects 1/40 of the rows
	pseudoDefCausSize = 8.0  // per-column size GetAvgRowSize falls back to

	// outOfRangeBetweenRate is the lower bound that outOfRangeEQSelectivity
	// applies to NDV before it derives a selectivity from it.
	outOfRangeBetweenRate = 100
)

const (
	// PseudoVersion means the pseudo statistics version is 0.
	PseudoVersion uint64 = 0

	// PseudoRowCount is exported for other packages to use.
	// When we haven't analyzed a causet, we use pseudo statistics to estimate costs.
	// It has event count 10000, equal condition selects 1/1000 of total rows, less condition selects 1/3 of total rows,
	// between condition selects 1/40 of total rows.
	PseudoRowCount = 10000
)

// Block represents statistics for a causet.
type Block struct {
	HistDefCausl
	Version uint64
	Name    string
	// ExtendedStats may be nil; Copy only clones it when it is set.
	ExtendedStats *ExtendedStatsDefCausl
}

// ExtendedStatsKey is the key for a cached item of an allegrosql.stats_extended record.
type ExtendedStatsKey struct {
	StatsName string
	EDB       string
}

// ExtendedStatsItem is the cached item of an allegrosql.stats_extended record.
type ExtendedStatsItem struct {
	DefCausIDs []int64
	Tp         uint8
	ScalarVals float64
	StringVals string
}

// ExtendedStatsDefCausl is a collection of cached items for allegrosql.stats_extended records.
type ExtendedStatsDefCausl struct {
	Stats                map[ExtendedStatsKey]*ExtendedStatsItem
	LastUFIDelateVersion uint64
}

// NewExtendedStatsDefCausl allocates an ExtendedStatsDefCausl struct with an
// empty, non-nil Stats map.
func NewExtendedStatsDefCausl() *ExtendedStatsDefCausl {
	return &ExtendedStatsDefCausl{Stats: make(map[ExtendedStatsKey]*ExtendedStatsItem)}
}

// HistDefCausl is a collection of histograms. It collects enough information for the plan to calculate the selectivity.
type HistDefCausl struct {
	PhysicalID int64
	// DeferredCausets and Indices are keyed by ID. ID2UniqueID and
	// GenerateHistDefCauslFromDeferredCausetInfo build collections whose
	// DeferredCausets map is keyed by the column's UniqueID instead.
	DeferredCausets map[int64]*DeferredCauset
	Indices         map[int64]*Index
	// Idx2DeferredCausetIDs maps the index id to its column ids. It's used to calculate the selectivity in causet.
	Idx2DeferredCausetIDs map[int64][]int64
	// DefCausID2IdxID maps the column id to the id of an index whose first column is it. It's used to calculate the selectivity in causet.
	DefCausID2IdxID map[int64]int64
	// Count is the row count that the estimations in this file scale against.
	Count       int64
	ModifyCount int64 // Total modify count in a causet.

	// HavePhysicalID being true means this HistDefCausl is from a single causet and has its ID's information.
	// The physical id is used when trying to load column stats from storage.
	HavePhysicalID bool
	// Pseudo is set by PseudoTable and is passed to the IsInvalid checks of
	// the per-column and per-index statistics.
	Pseudo bool
}

// MemoryUsage returns the total memory usage of this Block.
111 // it will only calc the size of DeferredCausets and Indices stats data of causet. 112 // We ignore the size of other spacetimedata in Block 113 func (t *Block) MemoryUsage() (sum int64) { 114 for _, col := range t.DeferredCausets { 115 if col != nil { 116 sum += col.MemoryUsage() 117 } 118 } 119 for _, index := range t.Indices { 120 if index != nil { 121 sum += index.MemoryUsage() 122 } 123 } 124 return 125 } 126 127 // Copy copies the current causet. 128 func (t *Block) Copy() *Block { 129 newHistDefCausl := HistDefCausl{ 130 PhysicalID: t.PhysicalID, 131 HavePhysicalID: t.HavePhysicalID, 132 Count: t.Count, 133 DeferredCausets: make(map[int64]*DeferredCauset, len(t.DeferredCausets)), 134 Indices: make(map[int64]*Index, len(t.Indices)), 135 Pseudo: t.Pseudo, 136 ModifyCount: t.ModifyCount, 137 } 138 for id, col := range t.DeferredCausets { 139 newHistDefCausl.DeferredCausets[id] = col 140 } 141 for id, idx := range t.Indices { 142 newHistDefCausl.Indices[id] = idx 143 } 144 nt := &Block{ 145 HistDefCausl: newHistDefCausl, 146 Version: t.Version, 147 Name: t.Name, 148 } 149 if t.ExtendedStats != nil { 150 newExtStatsDefCausl := &ExtendedStatsDefCausl{ 151 Stats: make(map[ExtendedStatsKey]*ExtendedStatsItem), 152 LastUFIDelateVersion: t.ExtendedStats.LastUFIDelateVersion, 153 } 154 for key, item := range t.ExtendedStats.Stats { 155 newExtStatsDefCausl.Stats[key] = item 156 } 157 nt.ExtendedStats = newExtStatsDefCausl 158 } 159 return nt 160 } 161 162 // String implements Stringer interface. 
163 func (t *Block) String() string { 164 strs := make([]string, 0, len(t.DeferredCausets)+1) 165 strs = append(strs, fmt.Sprintf("Block:%d Count:%d", t.PhysicalID, t.Count)) 166 defcaus := make([]*DeferredCauset, 0, len(t.DeferredCausets)) 167 for _, col := range t.DeferredCausets { 168 defcaus = append(defcaus, col) 169 } 170 sort.Slice(defcaus, func(i, j int) bool { return defcaus[i].ID < defcaus[j].ID }) 171 for _, col := range defcaus { 172 strs = append(strs, col.String()) 173 } 174 idxs := make([]*Index, 0, len(t.Indices)) 175 for _, idx := range t.Indices { 176 idxs = append(idxs, idx) 177 } 178 sort.Slice(idxs, func(i, j int) bool { return idxs[i].ID < idxs[j].ID }) 179 for _, idx := range idxs { 180 strs = append(strs, idx.String()) 181 } 182 // TODO: concat content of ExtendedStatsDefCausl 183 return strings.Join(strs, "\n") 184 } 185 186 // IndexStartWithDeferredCauset finds the first index whose first column is the given column. 187 func (t *Block) IndexStartWithDeferredCauset(colName string) *Index { 188 for _, index := range t.Indices { 189 if index.Info.DeferredCausets[0].Name.L == colName { 190 return index 191 } 192 } 193 return nil 194 } 195 196 // DeferredCausetByName finds the statistics.DeferredCauset for the given column. 
197 func (t *Block) DeferredCausetByName(colName string) *DeferredCauset { 198 for _, c := range t.DeferredCausets { 199 if c.Info.Name.L == colName { 200 return c 201 } 202 } 203 return nil 204 } 205 206 type blockDeferredCausetID struct { 207 TableID int64 208 DeferredCausetID int64 209 } 210 211 type neededDeferredCausetMap struct { 212 m sync.Mutex 213 defcaus map[blockDeferredCausetID]struct{} 214 } 215 216 func (n *neededDeferredCausetMap) AllDefCauss() []blockDeferredCausetID { 217 n.m.Lock() 218 keys := make([]blockDeferredCausetID, 0, len(n.defcaus)) 219 for key := range n.defcaus { 220 keys = append(keys, key) 221 } 222 n.m.Unlock() 223 return keys 224 } 225 226 func (n *neededDeferredCausetMap) insert(col blockDeferredCausetID) { 227 n.m.Lock() 228 n.defcaus[col] = struct{}{} 229 n.m.Unlock() 230 } 231 232 func (n *neededDeferredCausetMap) Delete(col blockDeferredCausetID) { 233 n.m.Lock() 234 delete(n.defcaus, col) 235 n.m.Unlock() 236 } 237 238 // RatioOfPseudoEstimate means if modifyCount / statsTblCount is greater than this ratio, we think the stats is invalid 239 // and use pseudo estimation. 240 var RatioOfPseudoEstimate = atomic.NewFloat64(0.7) 241 242 // IsOutdated returns true if the causet stats is outdated. 243 func (t *Block) IsOutdated() bool { 244 if t.Count > 0 && float64(t.ModifyCount)/float64(t.Count) > RatioOfPseudoEstimate.Load() { 245 return true 246 } 247 return false 248 } 249 250 // DeferredCausetGreaterRowCount estimates the event count where the column greater than value. 251 func (t *Block) DeferredCausetGreaterRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) float64 { 252 c, ok := t.DeferredCausets[colID] 253 if !ok || c.IsInvalid(sc, t.Pseudo) { 254 return float64(t.Count) / pseudoLessRate 255 } 256 return c.greaterRowCount(value) * c.GetIncreaseFactor(t.Count) 257 } 258 259 // DeferredCausetLessRowCount estimates the event count where the column less than value. Note that null values are not counted. 
260 func (t *Block) DeferredCausetLessRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) float64 { 261 c, ok := t.DeferredCausets[colID] 262 if !ok || c.IsInvalid(sc, t.Pseudo) { 263 return float64(t.Count) / pseudoLessRate 264 } 265 return c.lessRowCount(value) * c.GetIncreaseFactor(t.Count) 266 } 267 268 // DeferredCausetBetweenRowCount estimates the event count where column greater or equal to a and less than b. 269 func (t *Block) DeferredCausetBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Causet, colID int64) float64 { 270 c, ok := t.DeferredCausets[colID] 271 if !ok || c.IsInvalid(sc, t.Pseudo) { 272 return float64(t.Count) / pseudoBetweenRate 273 } 274 count := c.BetweenRowCount(a, b) 275 if a.IsNull() { 276 count += float64(c.NullCount) 277 } 278 return count * c.GetIncreaseFactor(t.Count) 279 } 280 281 // DeferredCausetEqualRowCount estimates the event count where the column equals to value. 282 func (t *Block) DeferredCausetEqualRowCount(sc *stmtctx.StatementContext, value types.Causet, colID int64) (float64, error) { 283 c, ok := t.DeferredCausets[colID] 284 if !ok || c.IsInvalid(sc, t.Pseudo) { 285 return float64(t.Count) / pseudoEqualRate, nil 286 } 287 result, err := c.equalRowCount(sc, value, t.ModifyCount) 288 result *= c.GetIncreaseFactor(t.Count) 289 return result, errors.Trace(err) 290 } 291 292 // GetRowCountByIntDeferredCausetRanges estimates the event count by a slice of IntDeferredCausetRange. 
293 func (coll *HistDefCausl) GetRowCountByIntDeferredCausetRanges(sc *stmtctx.StatementContext, colID int64, intRanges []*ranger.Range) (float64, error) { 294 c, ok := coll.DeferredCausets[colID] 295 if !ok || c.IsInvalid(sc, coll.Pseudo) { 296 if len(intRanges) == 0 { 297 return 0, nil 298 } 299 if intRanges[0].LowVal[0].HoTT() == types.HoTTInt64 { 300 return getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count)), nil 301 } 302 return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil 303 } 304 result, err := c.GetDeferredCausetRowCount(sc, intRanges, coll.ModifyCount, true) 305 result *= c.GetIncreaseFactor(coll.Count) 306 return result, errors.Trace(err) 307 } 308 309 // GetRowCountByDeferredCausetRanges estimates the event count by a slice of Range. 310 func (coll *HistDefCausl) GetRowCountByDeferredCausetRanges(sc *stmtctx.StatementContext, colID int64, colRanges []*ranger.Range) (float64, error) { 311 c, ok := coll.DeferredCausets[colID] 312 if !ok || c.IsInvalid(sc, coll.Pseudo) { 313 return GetPseudoRowCountByDeferredCausetRanges(sc, float64(coll.Count), colRanges, 0) 314 } 315 result, err := c.GetDeferredCausetRowCount(sc, colRanges, coll.ModifyCount, false) 316 result *= c.GetIncreaseFactor(coll.Count) 317 return result, errors.Trace(err) 318 } 319 320 // GetRowCountByIndexRanges estimates the event count by a slice of Range. 
321 func (coll *HistDefCausl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) { 322 idx := coll.Indices[idxID] 323 if idx == nil || idx.IsInvalid(coll.Pseudo) { 324 defcausLen := -1 325 if idx != nil && idx.Info.Unique { 326 defcausLen = len(idx.Info.DeferredCausets) 327 } 328 return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), defcausLen) 329 } 330 var result float64 331 var err error 332 if idx.CMSketch != nil && idx.StatsVer == Version1 { 333 result, err = coll.getIndexRowCount(sc, idxID, indexRanges) 334 } else { 335 result, err = idx.GetRowCount(sc, indexRanges, coll.ModifyCount) 336 } 337 result *= idx.GetIncreaseFactor(coll.Count) 338 return result, errors.Trace(err) 339 } 340 341 // PseudoAvgCountPerValue gets a pseudo average count if histogram not exists. 342 func (t *Block) PseudoAvgCountPerValue() float64 { 343 return float64(t.Count) / pseudoEqualRate 344 } 345 346 // GetOrdinalOfRangeCond gets the ordinal of the position range condition, 347 // if not exist, it returns the end position. 348 func GetOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int { 349 for i := range ran.LowVal { 350 a, b := ran.LowVal[i], ran.HighVal[i] 351 cmp, err := a.CompareCauset(sc, &b) 352 if err != nil { 353 return 0 354 } 355 if cmp != 0 { 356 return i 357 } 358 } 359 return len(ran.LowVal) 360 } 361 362 // ID2UniqueID generates a new HistDefCausl whose `DeferredCausets` is built from UniqueID of given columns. 
363 func (coll *HistDefCausl) ID2UniqueID(columns []*memex.DeferredCauset) *HistDefCausl { 364 defcaus := make(map[int64]*DeferredCauset) 365 for _, col := range columns { 366 colHist, ok := coll.DeferredCausets[col.ID] 367 if ok { 368 defcaus[col.UniqueID] = colHist 369 } 370 } 371 newDefCausl := &HistDefCausl{ 372 PhysicalID: coll.PhysicalID, 373 HavePhysicalID: coll.HavePhysicalID, 374 Pseudo: coll.Pseudo, 375 Count: coll.Count, 376 ModifyCount: coll.ModifyCount, 377 DeferredCausets: defcaus, 378 } 379 return newDefCausl 380 } 381 382 // GenerateHistDefCauslFromDeferredCausetInfo generates a new HistDefCausl whose DefCausID2IdxID and IdxID2DefCausIDs is built from the given parameter. 383 func (coll *HistDefCausl) GenerateHistDefCauslFromDeferredCausetInfo(infos []*perceptron.DeferredCausetInfo, columns []*memex.DeferredCauset) *HistDefCausl { 384 newDefCausHistMap := make(map[int64]*DeferredCauset) 385 colInfoID2UniqueID := make(map[int64]int64, len(columns)) 386 colNames2UniqueID := make(map[string]int64) 387 for _, col := range columns { 388 colInfoID2UniqueID[col.ID] = col.UniqueID 389 } 390 for _, colInfo := range infos { 391 uniqueID, ok := colInfoID2UniqueID[colInfo.ID] 392 if ok { 393 colNames2UniqueID[colInfo.Name.L] = uniqueID 394 } 395 } 396 for id, colHist := range coll.DeferredCausets { 397 uniqueID, ok := colInfoID2UniqueID[id] 398 // DefCauslect the statistics by the given columns. 399 if ok { 400 newDefCausHistMap[uniqueID] = colHist 401 } 402 } 403 newIdxHistMap := make(map[int64]*Index) 404 idx2DeferredCausets := make(map[int64][]int64) 405 colID2IdxID := make(map[int64]int64) 406 for _, idxHist := range coll.Indices { 407 ids := make([]int64, 0, len(idxHist.Info.DeferredCausets)) 408 for _, idxDefCaus := range idxHist.Info.DeferredCausets { 409 uniqueID, ok := colNames2UniqueID[idxDefCaus.Name.L] 410 if !ok { 411 break 412 } 413 ids = append(ids, uniqueID) 414 } 415 // If the length of the id list is 0, this index won't be used in this query. 
416 if len(ids) == 0 { 417 continue 418 } 419 colID2IdxID[ids[0]] = idxHist.ID 420 newIdxHistMap[idxHist.ID] = idxHist 421 idx2DeferredCausets[idxHist.ID] = ids 422 } 423 newDefCausl := &HistDefCausl{ 424 PhysicalID: coll.PhysicalID, 425 HavePhysicalID: coll.HavePhysicalID, 426 Pseudo: coll.Pseudo, 427 Count: coll.Count, 428 ModifyCount: coll.ModifyCount, 429 DeferredCausets: newDefCausHistMap, 430 Indices: newIdxHistMap, 431 DefCausID2IdxID: colID2IdxID, 432 Idx2DeferredCausetIDs: idx2DeferredCausets, 433 } 434 return newDefCausl 435 } 436 437 // isSingleDefCausIdxNullRange checks if a range is [NULL, NULL] on a single-column index. 438 func isSingleDefCausIdxNullRange(idx *Index, ran *ranger.Range) bool { 439 if len(idx.Info.DeferredCausets) > 1 { 440 return false 441 } 442 l, h := ran.LowVal[0], ran.HighVal[0] 443 if l.IsNull() && h.IsNull() { 444 return true 445 } 446 return false 447 } 448 449 // outOfRangeEQSelectivity estimates selectivities for out-of-range values. 450 // It assumes all modifications are insertions and all new-inserted rows are uniformly distributed 451 // and has the same distribution with analyzed rows, which means each unique value should have the 452 // same number of rows(Tot/NDV) of it. 453 func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 { 454 if modifyRows == 0 { 455 return 0 // it must be 0 since the histogram contains the whole data 456 } 457 if ndv < outOfRangeBetweenRate { 458 ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV 459 } 460 selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can minus the TopN fraction here. 461 if selectivity*float64(totalRows) > float64(modifyRows) { 462 selectivity = float64(modifyRows) / float64(totalRows) 463 } 464 return selectivity 465 } 466 467 // getEqualCondSelectivity gets the selectivity of the equal conditions. 
468 func (coll *HistDefCausl) getEqualCondSelectivity(idx *Index, bytes []byte, usedDefCaussLen int) float64 { 469 coverAll := len(idx.Info.DeferredCausets) == usedDefCaussLen 470 // In this case, the event count is at most 1. 471 if idx.Info.Unique && coverAll { 472 return 1.0 / float64(idx.TotalRowCount()) 473 } 474 val := types.NewBytesCauset(bytes) 475 if idx.outOfRange(val) { 476 // When the value is out of range, we could not found this value in the CM Sketch, 477 // so we use heuristic methods to estimate the selectivity. 478 if idx.NDV > 0 && coverAll { 479 return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount())) 480 } 481 // The equal condition only uses prefix columns of the index. 482 colIDs := coll.Idx2DeferredCausetIDs[idx.ID] 483 var ndv int64 484 for i, colID := range colIDs { 485 if i >= usedDefCaussLen { 486 break 487 } 488 ndv = mathutil.MaxInt64(ndv, coll.DeferredCausets[colID].NDV) 489 } 490 return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount())) 491 } 492 return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount()) 493 } 494 495 func (coll *HistDefCausl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) { 496 idx := coll.Indices[idxID] 497 totalCount := float64(0) 498 for _, ran := range indexRanges { 499 rangePosition := GetOrdinalOfRangeCond(sc, ran) 500 var rangeVals []types.Causet 501 // Try to enum the last range values. 502 if rangePosition != len(ran.LowVal) { 503 rangeVals = enumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude) 504 if rangeVals != nil { 505 rangePosition++ 506 } 507 } 508 // If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range 509 // on single-column index, use previous way as well, because CMSketch does not contain null 510 // values in this case. 
511 if rangePosition == 0 || isSingleDefCausIdxNullRange(idx, ran) { 512 count, err := idx.GetRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount) 513 if err != nil { 514 return 0, errors.Trace(err) 515 } 516 totalCount += count 517 continue 518 } 519 var selectivity float64 520 // use CM Sketch to estimate the equal conditions 521 if rangeVals == nil { 522 bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...) 523 if err != nil { 524 return 0, errors.Trace(err) 525 } 526 selectivity = coll.getEqualCondSelectivity(idx, bytes, rangePosition) 527 } else { 528 bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition-1]...) 529 if err != nil { 530 return 0, errors.Trace(err) 531 } 532 prefixLen := len(bytes) 533 for _, val := range rangeVals { 534 bytes = bytes[:prefixLen] 535 bytes, err = codec.EncodeKey(sc, bytes, val) 536 if err != nil { 537 return 0, err 538 } 539 selectivity += coll.getEqualCondSelectivity(idx, bytes, rangePosition) 540 } 541 } 542 // use histogram to estimate the range condition 543 if rangePosition != len(ran.LowVal) { 544 rang := ranger.Range{ 545 LowVal: []types.Causet{ran.LowVal[rangePosition]}, 546 LowExclude: ran.LowExclude, 547 HighVal: []types.Causet{ran.HighVal[rangePosition]}, 548 HighExclude: ran.HighExclude, 549 } 550 var count float64 551 var err error 552 colIDs := coll.Idx2DeferredCausetIDs[idxID] 553 var colID int64 554 if rangePosition >= len(colIDs) { 555 colID = -1 556 } else { 557 colID = colIDs[rangePosition] 558 } 559 // prefer index stats over column stats 560 if idx, ok := coll.DefCausID2IdxID[colID]; ok { 561 count, err = coll.GetRowCountByIndexRanges(sc, idx, []*ranger.Range{&rang}) 562 } else { 563 count, err = coll.GetRowCountByDeferredCausetRanges(sc, colID, []*ranger.Range{&rang}) 564 } 565 if err != nil { 566 return 0, errors.Trace(err) 567 } 568 selectivity = selectivity * count / float64(idx.TotalRowCount()) 569 } 570 totalCount += selectivity * float64(idx.TotalRowCount()) 571 } 572 if 
totalCount > idx.TotalRowCount() { 573 totalCount = idx.TotalRowCount() 574 } 575 return totalCount, nil 576 } 577 578 const fakePhysicalID int64 = -1 579 580 // PseudoTable creates a pseudo causet statistics. 581 func PseudoTable(tblInfo *perceptron.TableInfo) *Block { 582 pseudoHistDefCausl := HistDefCausl{ 583 Count: PseudoRowCount, 584 PhysicalID: tblInfo.ID, 585 HavePhysicalID: true, 586 DeferredCausets: make(map[int64]*DeferredCauset, len(tblInfo.DeferredCausets)), 587 Indices: make(map[int64]*Index, len(tblInfo.Indices)), 588 Pseudo: true, 589 } 590 t := &Block{ 591 HistDefCausl: pseudoHistDefCausl, 592 } 593 for _, col := range tblInfo.DeferredCausets { 594 if col.State == perceptron.StatePublic { 595 t.DeferredCausets[col.ID] = &DeferredCauset{ 596 PhysicalID: fakePhysicalID, 597 Info: col, 598 IsHandle: tblInfo.PKIsHandle && allegrosql.HasPriKeyFlag(col.Flag), 599 Histogram: *NewHistogram(col.ID, 0, 0, 0, &col.FieldType, 0, 0), 600 } 601 } 602 } 603 for _, idx := range tblInfo.Indices { 604 if idx.State == perceptron.StatePublic { 605 t.Indices[idx.ID] = &Index{ 606 Info: idx, 607 Histogram: *NewHistogram(idx.ID, 0, 0, 0, types.NewFieldType(allegrosql.TypeBlob), 0, 0)} 608 } 609 } 610 return t 611 } 612 613 func getPseudoRowCountByIndexRanges(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, 614 blockRowCount float64, defcausLen int) (float64, error) { 615 if blockRowCount == 0 { 616 return 0, nil 617 } 618 var totalCount float64 619 for _, indexRange := range indexRanges { 620 count := blockRowCount 621 i, err := indexRange.PrefixEqualLen(sc) 622 if err != nil { 623 return 0, errors.Trace(err) 624 } 625 if i == defcausLen && !indexRange.LowExclude && !indexRange.HighExclude { 626 totalCount += 1.0 627 continue 628 } 629 if i >= len(indexRange.LowVal) { 630 i = len(indexRange.LowVal) - 1 631 } 632 rowCount, err := GetPseudoRowCountByDeferredCausetRanges(sc, blockRowCount, []*ranger.Range{indexRange}, i) 633 if err != nil { 634 return 0, 
errors.Trace(err) 635 } 636 count = count / blockRowCount * rowCount 637 // If the condition is a = 1, b = 1, c = 1, d = 1, we think every a=1, b=1, c=1 only filtrate 1/100 data, 638 // so as to avoid collapsing too fast. 639 for j := 0; j < i; j++ { 640 count = count / float64(100) 641 } 642 totalCount += count 643 } 644 if totalCount > blockRowCount { 645 totalCount = blockRowCount / 3.0 646 } 647 return totalCount, nil 648 } 649 650 // GetPseudoRowCountByDeferredCausetRanges calculate the event count by the ranges if there's no statistics information for this column. 651 func GetPseudoRowCountByDeferredCausetRanges(sc *stmtctx.StatementContext, blockRowCount float64, columnRanges []*ranger.Range, colIdx int) (float64, error) { 652 var rowCount float64 653 var err error 654 for _, ran := range columnRanges { 655 if ran.LowVal[colIdx].HoTT() == types.HoTTNull && ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue { 656 rowCount += blockRowCount 657 } else if ran.LowVal[colIdx].HoTT() == types.HoTTMinNotNull { 658 nullCount := blockRowCount / pseudoEqualRate 659 if ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue { 660 rowCount += blockRowCount - nullCount 661 } else if err == nil { 662 lessCount := blockRowCount / pseudoLessRate 663 rowCount += lessCount - nullCount 664 } 665 } else if ran.HighVal[colIdx].HoTT() == types.HoTTMaxValue { 666 rowCount += blockRowCount / pseudoLessRate 667 } else { 668 compare, err1 := ran.LowVal[colIdx].CompareCauset(sc, &ran.HighVal[colIdx]) 669 if err1 != nil { 670 return 0, errors.Trace(err1) 671 } 672 if compare == 0 { 673 rowCount += blockRowCount / pseudoEqualRate 674 } else { 675 rowCount += blockRowCount / pseudoBetweenRate 676 } 677 } 678 if err != nil { 679 return 0, errors.Trace(err) 680 } 681 } 682 if rowCount > blockRowCount { 683 rowCount = blockRowCount 684 } 685 return rowCount, nil 686 } 687 688 func getPseudoRowCountBySignedIntRanges(intRanges []*ranger.Range, blockRowCount float64) float64 { 689 var rowCount float64 
690 for _, rg := range intRanges { 691 var cnt float64 692 low := rg.LowVal[0].GetInt64() 693 if rg.LowVal[0].HoTT() == types.HoTTNull || rg.LowVal[0].HoTT() == types.HoTTMinNotNull { 694 low = math.MinInt64 695 } 696 high := rg.HighVal[0].GetInt64() 697 if rg.HighVal[0].HoTT() == types.HoTTMaxValue { 698 high = math.MaxInt64 699 } 700 if low == math.MinInt64 && high == math.MaxInt64 { 701 cnt = blockRowCount 702 } else if low == math.MinInt64 { 703 cnt = blockRowCount / pseudoLessRate 704 } else if high == math.MaxInt64 { 705 cnt = blockRowCount / pseudoLessRate 706 } else { 707 if low == high { 708 cnt = 1 // When primary key is handle, the equal event count is at most one. 709 } else { 710 cnt = blockRowCount / pseudoBetweenRate 711 } 712 } 713 if high-low > 0 && cnt > float64(high-low) { 714 cnt = float64(high - low) 715 } 716 rowCount += cnt 717 } 718 if rowCount > blockRowCount { 719 rowCount = blockRowCount 720 } 721 return rowCount 722 } 723 724 func getPseudoRowCountByUnsignedIntRanges(intRanges []*ranger.Range, blockRowCount float64) float64 { 725 var rowCount float64 726 for _, rg := range intRanges { 727 var cnt float64 728 low := rg.LowVal[0].GetUint64() 729 if rg.LowVal[0].HoTT() == types.HoTTNull || rg.LowVal[0].HoTT() == types.HoTTMinNotNull { 730 low = 0 731 } 732 high := rg.HighVal[0].GetUint64() 733 if rg.HighVal[0].HoTT() == types.HoTTMaxValue { 734 high = math.MaxUint64 735 } 736 if low == 0 && high == math.MaxUint64 { 737 cnt = blockRowCount 738 } else if low == 0 { 739 cnt = blockRowCount / pseudoLessRate 740 } else if high == math.MaxUint64 { 741 cnt = blockRowCount / pseudoLessRate 742 } else { 743 if low == high { 744 cnt = 1 // When primary key is handle, the equal event count is at most one. 
745 } else { 746 cnt = blockRowCount / pseudoBetweenRate 747 } 748 } 749 if high > low && cnt > float64(high-low) { 750 cnt = float64(high - low) 751 } 752 rowCount += cnt 753 } 754 if rowCount > blockRowCount { 755 rowCount = blockRowCount 756 } 757 return rowCount 758 } 759 760 // GetAvgRowSize computes average event size for given columns. 761 func (coll *HistDefCausl) GetAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, isEncodedKey bool, isForScan bool) (size float64) { 762 stochastikVars := ctx.GetStochastikVars() 763 if coll.Pseudo || len(coll.DeferredCausets) == 0 || coll.Count == 0 { 764 size = pseudoDefCausSize * float64(len(defcaus)) 765 } else { 766 for _, col := range defcaus { 767 colHist, ok := coll.DeferredCausets[col.UniqueID] 768 // Normally this would not happen, it is for compatibility with old version stats which 769 // does not include TotDefCausSize. 770 if !ok || (!colHist.IsHandle && colHist.TotDefCausSize == 0 && (colHist.NullCount != coll.Count)) { 771 size += pseudoDefCausSize 772 continue 773 } 774 // We differentiate if the column is encoded as key or value, because the resulted size 775 // is different. 776 if stochastikVars.EnableChunkRPC && !isForScan { 777 size += colHist.AvgDefCausSizeChunkFormat(coll.Count) 778 } else { 779 size += colHist.AvgDefCausSize(coll.Count, isEncodedKey) 780 } 781 } 782 } 783 if stochastikVars.EnableChunkRPC && !isForScan { 784 // Add 1/8 byte for each column's nullBitMap byte. 785 return size + float64(len(defcaus))/8 786 } 787 // Add 1 byte for each column's flag byte. See `encode` for details. 788 return size + float64(len(defcaus)) 789 } 790 791 // GetAvgRowSizeListInDisk computes average event size for given columns. 
792 func (coll *HistDefCausl) GetAvgRowSizeListInDisk(defcaus []*memex.DeferredCauset) (size float64) { 793 if coll.Pseudo || len(coll.DeferredCausets) == 0 || coll.Count == 0 { 794 for _, col := range defcaus { 795 size += float64(chunk.EstimateTypeWidth(col.GetType())) 796 } 797 } else { 798 for _, col := range defcaus { 799 colHist, ok := coll.DeferredCausets[col.UniqueID] 800 // Normally this would not happen, it is for compatibility with old version stats which 801 // does not include TotDefCausSize. 802 if !ok || (!colHist.IsHandle && colHist.TotDefCausSize == 0 && (colHist.NullCount != coll.Count)) { 803 size += float64(chunk.EstimateTypeWidth(col.GetType())) 804 continue 805 } 806 size += colHist.AvgDefCausSizeListInDisk(coll.Count) 807 } 808 } 809 // Add 8 byte for each column's size record. See `ListInDisk` for details. 810 return size + float64(8*len(defcaus)) 811 } 812 813 // GetTableAvgRowSize computes average event size for a causet scan, exclude the index key-value pairs. 814 func (coll *HistDefCausl) GetTableAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, storeType ekv.StoreType, handleInDefCauss bool) (size float64) { 815 size = coll.GetAvgRowSize(ctx, defcaus, false, true) 816 switch storeType { 817 case ekv.EinsteinDB: 818 size += blockcodec.RecordRowKeyLen 819 // The `defcaus` for EinsteinDB always contain the row_id, so prefix event size subtract its length. 820 size -= 8 821 case ekv.TiFlash: 822 if !handleInDefCauss { 823 size += 8 /* row_id length */ 824 } 825 } 826 return 827 } 828 829 // GetIndexAvgRowSize computes average event size for a index scan. 830 func (coll *HistDefCausl) GetIndexAvgRowSize(ctx stochastikctx.Context, defcaus []*memex.DeferredCauset, isUnique bool) (size float64) { 831 size = coll.GetAvgRowSize(ctx, defcaus, true, true) 832 // blockPrefix(1) + blockID(8) + indexPrefix(2) + indexID(8) 833 // Because the defcaus for index scan always contain the handle, so we don't add the rowID here. 
834 size += 19 835 if !isUnique { 836 // add the len("_") 837 size++ 838 } 839 return 840 }