// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"strings"
	"time"
	"unsafe"

	"github.com/twmb/murmur3"
	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
	"github.com/whtcorpsinc/BerolinaSQL/perceptron"
	"github.com/whtcorpsinc/BerolinaSQL/terror"
	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/fidelpb/go-fidelpb"
	"github.com/whtcorpsinc/milevadb/blockcodec"
	"github.com/whtcorpsinc/milevadb/ekv"
	"github.com/whtcorpsinc/milevadb/soliton/chunk"
	"github.com/whtcorpsinc/milevadb/soliton/codec"
	"github.com/whtcorpsinc/milevadb/soliton/collate"
	"github.com/whtcorpsinc/milevadb/soliton/logutil"
	"github.com/whtcorpsinc/milevadb/soliton/ranger"
	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
	"github.com/whtcorpsinc/milevadb/stochastikctx/variable"
	"github.com/whtcorpsinc/milevadb/types"
	"go.uber.org/zap"
)

// Histogram represents statistics for a column or index.
type Histogram struct {
	ID        int64 // DeferredCauset ID.
	NDV       int64 // Number of distinct values.
	NullCount int64 // Number of null values.
	// LastUFIDelateVersion is the version at which this histogram was last updated.
	LastUFIDelateVersion uint64

	Tp *types.FieldType

	// Histogram elements.
	//
	// A bucket bound is the smallest or greatest value stored in the bucket. The lower and upper bounds
	// are stored in one column.
	//
	// A bucket count is the number of items stored in all previous buckets and the current bucket.
	// Bucket counts are always in increasing order.
	//
	// A bucket repeat is the number of repeats of the bucket value; it can be used to find popular values.
	Bounds  *chunk.Chunk
	Buckets []Bucket

	// scalars is used for estimating the fraction of the interval [lower, upper] that lies within [lower, value].
	// For some types like `Int`, we do not build it because we can get them directly from `Bounds`.
	scalars []scalar
	// TotDefCausSize is the total column size for the histogram.
	// For non-fixed-length types, it includes LEN and BYTE.
	TotDefCausSize int64

	// Correlation is the statistical correlation between the physical row ordering and the logical ordering of
	// the column values. This ranges from -1 to +1, and it is only valid for DeferredCauset histograms, not for
	// Index histograms.
	Correlation float64
}

// Bucket stores the count and repeat of a bucket.
type Bucket struct {
	Count  int64
	Repeat int64
}

type scalar struct {
	lower        float64
	upper        float64
	commonPfxLen int // commonPfxLen is the common prefix length of the lower bound and upper bound when the value type is HoTTString or HoTTBytes.
}
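
// Illustrative sketch (not part of the original file): for bucket i, the lower
// bound is Bounds row 2*i, the upper bound is Bounds row 2*i+1, and Count is
// cumulative. A two-bucket blob histogram built with the helpers defined below
// could look like:
//
//	tp := types.NewFieldType(allegrosql.TypeBlob)
//	hg := NewHistogram(0, 4, 0, 0, tp, 2, 0)
//	lo0, hi0 := types.NewBytesCauset([]byte("a")), types.NewBytesCauset([]byte("c"))
//	hg.AppendBucket(&lo0, &hi0, 5, 2) // 5 rows in [a, c], value "c" repeats twice
//	lo1, hi1 := types.NewBytesCauset([]byte("d")), types.NewBytesCauset([]byte("f"))
//	hg.AppendBucket(&lo1, &hi1, 9, 1) // cumulative Count: 9 rows in total so far
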
// NewHistogram creates a new histogram.
func NewHistogram(id, ndv, nullCount int64, version uint64, tp *types.FieldType, bucketSize int, totDefCausSize int64) *Histogram {
	return &Histogram{
		ID:                   id,
		NDV:                  ndv,
		NullCount:            nullCount,
		LastUFIDelateVersion: version,
		Tp:                   tp,
		Bounds:               chunk.NewChunkWithCapacity([]*types.FieldType{tp}, 2*bucketSize),
		Buckets:              make([]Bucket, 0, bucketSize),
		TotDefCausSize:       totDefCausSize,
	}
}

// GetLower gets the lower bound of bucket `idx`.
func (hg *Histogram) GetLower(idx int) *types.Causet {
	d := hg.Bounds.GetRow(2*idx).GetCauset(0, hg.Tp)
	return &d
}

// GetUpper gets the upper bound of bucket `idx`.
func (hg *Histogram) GetUpper(idx int) *types.Causet {
	d := hg.Bounds.GetRow(2*idx+1).GetCauset(0, hg.Tp)
	return &d
}

// MemoryUsage returns the total memory usage of this Histogram.
// Every change to the Histogram of the table already costs O(n) complexity,
// so calculating the memory usage takes only a little extra time.
// We ignore the size of other metadata in Histogram.
func (hg *Histogram) MemoryUsage() (sum int64) {
	if hg == nil {
		return
	}
	sum = hg.Bounds.MemoryUsage() + int64(cap(hg.Buckets)*int(unsafe.Sizeof(Bucket{}))) + int64(cap(hg.scalars)*int(unsafe.Sizeof(scalar{})))
	return
}

// AvgDefCausSize is the average column size of the histogram. These sizes are derived from function `encode`
// and `Causet::ConvertTo`, so we need to update them if those two functions are changed.
func (c *DeferredCauset) AvgDefCausSize(count int64, isKey bool) float64 {
	if count == 0 {
		return 0
	}
	// Note that, if the handle column is encoded as value, instead of key, i.e.,
	// when the handle column is in a unique index, the real column size may be
	// smaller than 8 because it is encoded using `EncodeVarint`. Since we don't
	// know the exact value size now, use 8 as an approximation.
	if c.IsHandle {
		return 8
	}
	histCount := c.TotalRowCount()
	notNullRatio := 1.0
	if histCount > 0 {
		notNullRatio = 1.0 - float64(c.NullCount)/histCount
	}
	switch c.Histogram.Tp.Tp {
	case allegrosql.TypeFloat, allegrosql.TypeDouble, allegrosql.TypeDuration, allegrosql.TypeDate, allegrosql.TypeDatetime, allegrosql.TypeTimestamp:
		return 8 * notNullRatio
	case allegrosql.TypeTiny, allegrosql.TypeShort, allegrosql.TypeInt24, allegrosql.TypeLong, allegrosql.TypeLonglong, allegrosql.TypeYear, allegrosql.TypeEnum, allegrosql.TypeBit, allegrosql.TypeSet:
		if isKey {
			return 8 * notNullRatio
		}
	}
	// Keep two decimal places.
	return math.Round(float64(c.TotDefCausSize)/float64(count)*100) / 100
}

// AvgDefCausSizeChunkFormat is the average column size of the histogram. These sizes are derived from function `Encode`
// and `DecodeToChunk`, so we need to update them if those two functions are changed.
func (c *DeferredCauset) AvgDefCausSizeChunkFormat(count int64) float64 {
	if count == 0 {
		return 0
	}
	fixedLen := chunk.GetFixedLen(c.Histogram.Tp)
	if fixedLen != -1 {
		return float64(fixedLen)
	}
	// Keep two decimal places.
	// Add 8 bytes for the offsets of non-fixed-length types.
	// Subtract Log2(avgSize) for the LEN of non-fixed-length types.
	avgSize := float64(c.TotDefCausSize) / float64(count)
	if avgSize < 1 {
		return math.Round(avgSize*100)/100 + 8
	}
	return math.Round((avgSize-math.Log2(avgSize))*100)/100 + 8
}

// AvgDefCausSizeListInDisk is the average column size of the histogram. These sizes are derived
// from `chunk.ListInDisk`, so we need to update them if that is changed.
func (c *DeferredCauset) AvgDefCausSizeListInDisk(count int64) float64 {
	if count == 0 {
		return 0
	}
	histCount := c.TotalRowCount()
	notNullRatio := 1.0
	if histCount > 0 {
		notNullRatio = 1.0 - float64(c.NullCount)/histCount
	}
	size := chunk.GetFixedLen(c.Histogram.Tp)
	if size != -1 {
		return float64(size) * notNullRatio
	}
	// Keep two decimal places.
	// Subtract Log2(avgSize) for the LEN of non-fixed-length types.
	avgSize := float64(c.TotDefCausSize) / float64(count)
	if avgSize < 1 {
		return math.Round((avgSize)*100) / 100
	}
	return math.Round((avgSize-math.Log2(avgSize))*100) / 100
}

// AppendBucket appends a bucket into `hg`.
func (hg *Histogram) AppendBucket(lower *types.Causet, upper *types.Causet, count, repeat int64) {
	hg.Buckets = append(hg.Buckets, Bucket{Count: count, Repeat: repeat})
	hg.Bounds.AppendCauset(0, lower)
	hg.Bounds.AppendCauset(0, upper)
}

func (hg *Histogram) uFIDelateLastBucket(upper *types.Causet, count, repeat int64) {
	len := hg.Len()
	hg.Bounds.TruncateTo(2*len - 1)
	hg.Bounds.AppendCauset(0, upper)
	hg.Buckets[len-1] = Bucket{Count: count, Repeat: repeat}
}

// DecodeTo decodes the histogram bucket values into `Tp`.
func (hg *Histogram) DecodeTo(tp *types.FieldType, timeZone *time.Location) error {
	oldIter := chunk.NewIterator4Chunk(hg.Bounds)
	hg.Bounds = chunk.NewChunkWithCapacity([]*types.FieldType{tp}, oldIter.Len())
	hg.Tp = tp
	for event := oldIter.Begin(); event != oldIter.End(); event = oldIter.Next() {
		causet, err := blockcodec.DecodeDeferredCausetValue(event.GetBytes(0), tp, timeZone)
		if err != nil {
			return errors.Trace(err)
		}
		hg.Bounds.AppendCauset(0, &causet)
	}
	return nil
}

// ConvertTo converts the histogram bucket values into `Tp`.
func (hg *Histogram) ConvertTo(sc *stmtctx.StatementContext, tp *types.FieldType) (*Histogram, error) {
	hist := NewHistogram(hg.ID, hg.NDV, hg.NullCount, hg.LastUFIDelateVersion, tp, hg.Len(), hg.TotDefCausSize)
	hist.Correlation = hg.Correlation
	iter := chunk.NewIterator4Chunk(hg.Bounds)
	for event := iter.Begin(); event != iter.End(); event = iter.Next() {
		d := event.GetCauset(0, hg.Tp)
		d, err := d.ConvertTo(sc, tp)
		if err != nil {
			return nil, errors.Trace(err)
		}
		hist.Bounds.AppendCauset(0, &d)
	}
	hist.Buckets = hg.Buckets
	return hist, nil
}

// Len is the number of buckets in the histogram.
func (hg *Histogram) Len() int {
	return len(hg.Buckets)
}

// HistogramEqual tests if two histograms are equal.
func HistogramEqual(a, b *Histogram, ignoreID bool) bool {
	if ignoreID {
		old := b.ID
		b.ID = a.ID
		defer func() { b.ID = old }()
	}
	return bytes.Equal([]byte(a.ToString(0)), []byte(b.ToString(0)))
}

// Constants for the stats version. These constants can be used to solve compatibility issues.
const (
	CurStatsVersion = Version1
	Version1        = 1
)

// AnalyzeFlag is set when the statistics come from analyze and have not been modified by feedback.
const AnalyzeFlag = 1

// IsAnalyzed checks whether this flag contains AnalyzeFlag.
func IsAnalyzed(flag int64) bool {
	return (flag & AnalyzeFlag) > 0
}

// ResetAnalyzeFlag resets the AnalyzeFlag because it has been modified by feedback.
func ResetAnalyzeFlag(flag int64) int64 {
	return flag &^ AnalyzeFlag
}

// ValueToString converts a possibly encoded value to a formatted string. If the value is encoded, then
// idxDefCauss equals the number of origin values; otherwise idxDefCauss is 0.
func ValueToString(vars *variable.StochastikVars, value *types.Causet, idxDefCauss int, idxDeferredCausetTypes []byte) (string, error) {
	if idxDefCauss == 0 {
		return value.ToString()
	}
	var loc *time.Location
	if vars != nil {
		loc = vars.Location()
	}
	// Ignore the error and treat the remaining part that cannot be decoded successfully as bytes.
	decodedVals, remained, err := codec.DecodeRange(value.GetBytes(), idxDefCauss, idxDeferredCausetTypes, loc)
	// Ignore err explicitly to pass errcheck.
	_ = err
	if len(remained) > 0 {
		decodedVals = append(decodedVals, types.NewBytesCauset(remained))
	}
	str, err := types.CausetsToString(decodedVals, true)
	return str, err
}

// BucketToString converts the given bucket to string format.
func (hg *Histogram) BucketToString(bktID, idxDefCauss int) string {
	upperVal, err := ValueToString(nil, hg.GetUpper(bktID), idxDefCauss, nil)
	terror.Log(errors.Trace(err))
	lowerVal, err := ValueToString(nil, hg.GetLower(bktID), idxDefCauss, nil)
	terror.Log(errors.Trace(err))
	return fmt.Sprintf("num: %d lower_bound: %s upper_bound: %s repeats: %d", hg.bucketCount(bktID), lowerVal, upperVal, hg.Buckets[bktID].Repeat)
}

// ToString gets the string representation for the histogram.
func (hg *Histogram) ToString(idxDefCauss int) string {
	strs := make([]string, 0, hg.Len()+1)
	if idxDefCauss > 0 {
		strs = append(strs, fmt.Sprintf("index:%d ndv:%d", hg.ID, hg.NDV))
	} else {
		strs = append(strs, fmt.Sprintf("column:%d ndv:%d totDefCausSize:%d", hg.ID, hg.NDV, hg.TotDefCausSize))
	}
	for i := 0; i < hg.Len(); i++ {
		strs = append(strs, hg.BucketToString(i, idxDefCauss))
	}
	return strings.Join(strs, "\n")
}

// equalRowCount estimates the row count where the column equals the value.
func (hg *Histogram) equalRowCount(value types.Causet) float64 {
	index, match := hg.Bounds.LowerBound(0, &value)
	// Since we store the lower and upper bounds together, if the index is odd, it points to an upper bound.
	if index%2 == 1 {
		if match {
			return float64(hg.Buckets[index/2].Repeat)
		}
		return hg.notNullCount() / float64(hg.NDV)
	}
	if match {
		cmp := chunk.GetCompareFunc(hg.Tp)
		if cmp(hg.Bounds.GetRow(index), 0, hg.Bounds.GetRow(index+1), 0) == 0 {
			return float64(hg.Buckets[index/2].Repeat)
		}
		return hg.notNullCount() / float64(hg.NDV)
	}
	return 0
}
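
// Illustrative sketch (not part of the original file): for a column histogram,
// ToString(0) renders one header line followed by one line per bucket using the
// format strings above. For the two-bucket histogram sketched earlier it would be
//
//	column:0 ndv:4 totDefCausSize:0
//	num: 5 lower_bound: a upper_bound: c repeats: 2
//	num: 4 lower_bound: d upper_bound: f repeats: 1
//
// where num is the per-bucket count, i.e. the cumulative Count minus the previous
// bucket's Count (see bucketCount below).
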
// greaterRowCount estimates the row count where the column is greater than the value.
func (hg *Histogram) greaterRowCount(value types.Causet) float64 {
	gtCount := hg.notNullCount() - hg.lessRowCount(value) - hg.equalRowCount(value)
	return math.Max(0, gtCount)
}

// LessRowCountWithBktIdx estimates the row count where the column is less than the value.
// It also returns the index of the bucket the value falls in.
func (hg *Histogram) LessRowCountWithBktIdx(value types.Causet) (float64, int) {
	// All the values are null.
	if hg.Bounds.NumRows() == 0 {
		return 0, 0
	}
	index, match := hg.Bounds.LowerBound(0, &value)
	if index == hg.Bounds.NumRows() {
		return hg.notNullCount(), hg.Len() - 1
	}
	// Since we store the lower and upper bounds together, dividing the index by 2 gives the bucket index.
	bucketIdx := index / 2
	curCount, curRepeat := float64(hg.Buckets[bucketIdx].Count), float64(hg.Buckets[bucketIdx].Repeat)
	preCount := float64(0)
	if bucketIdx > 0 {
		preCount = float64(hg.Buckets[bucketIdx-1].Count)
	}
	if index%2 == 1 {
		if match {
			return curCount - curRepeat, bucketIdx
		}
		return preCount + hg.calcFraction(bucketIdx, &value)*(curCount-curRepeat-preCount), bucketIdx
	}
	return preCount, bucketIdx
}

func (hg *Histogram) lessRowCount(value types.Causet) float64 {
	result, _ := hg.LessRowCountWithBktIdx(value)
	return result
}

// BetweenRowCount estimates the row count where the column is greater than or equal to a and less than b.
func (hg *Histogram) BetweenRowCount(a, b types.Causet) float64 {
	lessCountA := hg.lessRowCount(a)
	lessCountB := hg.lessRowCount(b)
	// If lessCountA is not less than lessCountB, they may fall into the same bucket and we cannot estimate
	// the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not be
	// greater than lessCountB or notNullCount-lessCountA.
	if lessCountA >= lessCountB && hg.NDV > 0 {
		result := math.Min(lessCountB, hg.notNullCount()-lessCountA)
		return math.Min(result, hg.notNullCount()/float64(hg.NDV))
	}
	return lessCountB - lessCountA
}

// TotalRowCount returns the total count of this histogram.
func (hg *Histogram) TotalRowCount() float64 {
	return hg.notNullCount() + float64(hg.NullCount)
}

// notNullCount indicates the count of non-null values in a column histogram and a single-column index histogram.
// For a multi-column index histogram, since we cannot define null for a row, we treat all rows as non-null;
// that means notNullCount returns the same value as TotalRowCount for multi-column index histograms.
func (hg *Histogram) notNullCount() float64 {
	if hg.Len() == 0 {
		return 0
	}
	return float64(hg.Buckets[hg.Len()-1].Count)
}

// mergeBuckets is used to merge every two neighboring buckets.
func (hg *Histogram) mergeBuckets(bucketIdx int) {
	curBuck := 0
	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp}, bucketIdx)
	for i := 0; i+1 <= bucketIdx; i += 2 {
		hg.Buckets[curBuck] = hg.Buckets[i+1]
		c.AppendCauset(0, hg.GetLower(i))
		c.AppendCauset(0, hg.GetUpper(i+1))
		curBuck++
	}
	if bucketIdx%2 == 0 {
		hg.Buckets[curBuck] = hg.Buckets[bucketIdx]
		c.AppendCauset(0, hg.GetLower(bucketIdx))
		c.AppendCauset(0, hg.GetUpper(bucketIdx))
		curBuck++
	}
	hg.Bounds = c
	hg.Buckets = hg.Buckets[:curBuck]
}
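
// Illustrative sketch (not part of the original file): with the two-bucket
// histogram sketched earlier (cumulative counts 5 and 9, Repeat of bucket 1 is 1),
// an estimate for values in [lower of bucket 1, upper of bucket 1) is
//
//	BetweenRowCount(lo1, hi1) = lessRowCount(hi1) - lessRowCount(lo1)
//	                          = (9 - 1) - 5 = 3
//
// because lessRowCount of an exact upper-bound match excludes that bound's Repeat,
// while lessRowCount of a lower bound returns the previous bucket's cumulative count.
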
// GetIncreaseFactor returns the factor by which the data has increased since the last analysis.
func (hg *Histogram) GetIncreaseFactor(totalCount int64) float64 {
	columnCount := hg.TotalRowCount()
	if columnCount == 0 {
		// avoid dividing by 0
		return 1.0
	}
	return float64(totalCount) / columnCount
}

// validRange checks if the range is valid. It is used by `SplitRange` to remove invalid ranges;
// the possible types of range are index key ranges and handle key ranges.
func validRange(sc *stmtctx.StatementContext, ran *ranger.Range, encoded bool) bool {
	var low, high []byte
	if encoded {
		low, high = ran.LowVal[0].GetBytes(), ran.HighVal[0].GetBytes()
	} else {
		var err error
		low, err = codec.EncodeKey(sc, nil, ran.LowVal[0])
		if err != nil {
			return false
		}
		high, err = codec.EncodeKey(sc, nil, ran.HighVal[0])
		if err != nil {
			return false
		}
	}
	if ran.LowExclude {
		low = ekv.Key(low).PrefixNext()
	}
	if !ran.HighExclude {
		high = ekv.Key(high).PrefixNext()
	}
	return bytes.Compare(low, high) < 0
}

func checkHoTT(vals []types.Causet, HoTT byte) bool {
	if HoTT == types.HoTTString {
		HoTT = types.HoTTBytes
	}
	for _, val := range vals {
		valHoTT := val.HoTT()
		if valHoTT == types.HoTTNull || valHoTT == types.HoTTMinNotNull || valHoTT == types.HoTTMaxValue {
			continue
		}
		if valHoTT == types.HoTTString {
			valHoTT = types.HoTTBytes
		}
		if valHoTT != HoTT {
			return false
		}
		// Only check the first non-null value.
		break
	}
	return true
}

func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool {
	HoTT := hg.GetLower(0).HoTT()
	for _, ran := range ranges {
		if !checkHoTT(ran.LowVal, HoTT) || !checkHoTT(ran.HighVal, HoTT) {
			return false
		}
	}
	return true
}

// SplitRange splits the ranges according to the histogram lower bounds. Note that we treat the first bucket's lower bound
// as -inf and the last bucket's upper bound as +inf, so all the split ranges will totally fall in one of (-inf, l(1)),
// [l(1), l(2)), ..., [l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets and l(i) is the i-th bucket's lower bound.
func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) {
	if !hg.typeMatch(oldRanges) {
		return oldRanges, false
	}
	// Treat the only bucket as (-inf, +inf), so we do not need to split it.
	if hg.Len() == 1 {
		return oldRanges, true
	}
	ranges := make([]*ranger.Range, 0, len(oldRanges))
	for _, ran := range oldRanges {
		ranges = append(ranges, ran.Clone())
	}
	split := make([]*ranger.Range, 0, len(ranges))
	for len(ranges) > 0 {
		// Find the first bound that is greater than the LowVal.
		idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0])
		// Treat the last bucket's upper bound as +inf, so we do not need to split any more.
		if idx >= hg.Bounds.NumRows()-1 {
			split = append(split, ranges...)
			break
		}
		// Treat the first bucket's lower bound as -inf, just increase it to the next lower bound.
		if idx == 0 {
			idx = 2
		}
		// Get the next lower bound.
		if idx%2 == 1 {
			idx++
		}
		lowerBound := hg.Bounds.GetRow(idx)
		var i int
		// Find the first range that needs to be split by the lower bound.
		for ; i < len(ranges); i++ {
			if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 {
				break
			}
		}
		split = append(split, ranges[:i]...)
		ranges = ranges[i:]
		if len(ranges) == 0 {
			break
		}
		// Split according to the lower bound.
		cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0])
		if cmp > 0 {
			lower := lowerBound.GetCauset(0, hg.Tp)
			newRange := &ranger.Range{
				LowExclude:  ranges[0].LowExclude,
				LowVal:      []types.Causet{ranges[0].LowVal[0]},
				HighVal:     []types.Causet{lower},
				HighExclude: true}
			if validRange(sc, newRange, encoded) {
				split = append(split, newRange)
			}
			ranges[0].LowVal[0] = lower
			ranges[0].LowExclude = false
			if !validRange(sc, ranges[0], encoded) {
				ranges = ranges[1:]
			}
		}
	}
	return split, true
}

func (hg *Histogram) bucketCount(idx int) int64 {
	if idx == 0 {
		return hg.Buckets[0].Count
	}
	return hg.Buckets[idx].Count - hg.Buckets[idx-1].Count
}

// HistogramToProto converts Histogram to its protobuf representation.
// Note that when this is used, the lower/upper bounds in the buckets must be BytesCauset.
func HistogramToProto(hg *Histogram) *fidelpb.Histogram {
	protoHg := &fidelpb.Histogram{
		Ndv: hg.NDV,
	}
	for i := 0; i < hg.Len(); i++ {
		bkt := &fidelpb.Bucket{
			Count:      hg.Buckets[i].Count,
			LowerBound: hg.GetLower(i).GetBytes(),
			UpperBound: hg.GetUpper(i).GetBytes(),
			Repeats:    hg.Buckets[i].Repeat,
		}
		protoHg.Buckets = append(protoHg.Buckets, bkt)
	}
	return protoHg
}

// HistogramFromProto converts Histogram from its protobuf representation.
// Note that we set BytesCauset for the lower/upper bounds in the buckets; decoding happens
// after all histograms are merged.
func HistogramFromProto(protoHg *fidelpb.Histogram) *Histogram {
	tp := types.NewFieldType(allegrosql.TypeBlob)
	hg := NewHistogram(0, protoHg.Ndv, 0, 0, tp, len(protoHg.Buckets), 0)
	for _, bucket := range protoHg.Buckets {
		lower, upper := types.NewBytesCauset(bucket.LowerBound), types.NewBytesCauset(bucket.UpperBound)
		hg.AppendBucket(&lower, &upper, bucket.Count, bucket.Repeats)
	}
	return hg
}

func (hg *Histogram) popFirstBucket() {
	hg.Buckets = hg.Buckets[1:]
	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp, hg.Tp}, hg.Bounds.NumRows()-2)
	c.Append(hg.Bounds, 2, hg.Bounds.NumRows())
	hg.Bounds = c
}

// IsIndexHist checks whether the current histogram is one for an index.
func (hg *Histogram) IsIndexHist() bool {
	return hg.Tp.Tp == allegrosql.TypeBlob
}
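
// Illustrative sketch (not part of the original file): histograms travel over the
// wire as fidelpb.Histogram with raw byte bounds, so a round trip comes back as a
// blob-typed histogram and DecodeTo should be called once all parts are merged:
//
//	protoHg := HistogramToProto(hg)         // bounds must already be BytesCauset
//	restored := HistogramFromProto(protoHg) // restored.Tp is allegrosql.TypeBlob
//	// err := restored.DecodeTo(realTp, loc) // realTp and loc are hypothetical, supplied by the caller
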
// MergeHistograms merges two histograms.
func MergeHistograms(sc *stmtctx.StatementContext, lh *Histogram, rh *Histogram, bucketSize int) (*Histogram, error) {
	if lh.Len() == 0 {
		return rh, nil
	}
	if rh.Len() == 0 {
		return lh, nil
	}
	lh.NDV += rh.NDV
	lLen := lh.Len()
	cmp, err := lh.GetUpper(lLen-1).CompareCauset(sc, rh.GetLower(0))
	if err != nil {
		return nil, errors.Trace(err)
	}
	offset := int64(0)
	if cmp == 0 {
		lh.NDV--
		lh.uFIDelateLastBucket(rh.GetUpper(0), lh.Buckets[lLen-1].Count+rh.Buckets[0].Count, rh.Buckets[0].Repeat)
		offset = rh.Buckets[0].Count
		rh.popFirstBucket()
	}
	for lh.Len() > bucketSize {
		lh.mergeBuckets(lh.Len() - 1)
	}
	if rh.Len() == 0 {
		return lh, nil
	}
	for rh.Len() > bucketSize {
		rh.mergeBuckets(rh.Len() - 1)
	}
	lCount := lh.Buckets[lh.Len()-1].Count
	rCount := rh.Buckets[rh.Len()-1].Count - offset
	lAvg := float64(lCount) / float64(lh.Len())
	rAvg := float64(rCount) / float64(rh.Len())
	for lh.Len() > 1 && lAvg*2 <= rAvg {
		lh.mergeBuckets(lh.Len() - 1)
		lAvg *= 2
	}
	for rh.Len() > 1 && rAvg*2 <= lAvg {
		rh.mergeBuckets(rh.Len() - 1)
		rAvg *= 2
	}
	for i := 0; i < rh.Len(); i++ {
		lh.AppendBucket(rh.GetLower(i), rh.GetUpper(i), rh.Buckets[i].Count+lCount-offset, rh.Buckets[i].Repeat)
	}
	for lh.Len() > bucketSize {
		lh.mergeBuckets(lh.Len() - 1)
	}
	return lh, nil
}

// AvgCountPerNotNullValue gets the average row count per value from the histogram data.
func (hg *Histogram) AvgCountPerNotNullValue(totalCount int64) float64 {
	factor := hg.GetIncreaseFactor(totalCount)
	totalNotNull := hg.notNullCount() * factor
	curNDV := float64(hg.NDV) * factor
	curNDV = math.Max(curNDV, 1)
	return totalNotNull / curNDV
}

func (hg *Histogram) outOfRange(val types.Causet) bool {
	if hg.Len() == 0 {
		return true
	}
	return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 ||
		chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0
}

// Copy deep copies the histogram.
func (hg *Histogram) Copy() *Histogram {
	newHist := *hg
	newHist.Bounds = hg.Bounds.CopyConstruct()
	newHist.Buckets = make([]Bucket, 0, len(hg.Buckets))
	newHist.Buckets = append(newHist.Buckets, hg.Buckets...)
	return &newHist
}

// RemoveUpperBound removes the upper bound from the histogram.
// It is used when merging stats for incremental analyze.
func (hg *Histogram) RemoveUpperBound() *Histogram {
	hg.Buckets[hg.Len()-1].Count -= hg.Buckets[hg.Len()-1].Repeat
	hg.Buckets[hg.Len()-1].Repeat = 0
	return hg
}

// TruncateHistogram truncates the histogram to `numBkt` buckets.
func (hg *Histogram) TruncateHistogram(numBkt int) *Histogram {
	hist := hg.Copy()
	hist.Buckets = hist.Buckets[:numBkt]
	hist.Bounds.TruncateTo(numBkt * 2)
	return hist
}

// ErrorRate is the error rate of estimating row counts by buckets and the CM sketch.
type ErrorRate struct {
	ErrorTotal float64
	QueryTotal int64
}

// MaxErrorRate is the max error rate of estimated row counts for a non-pseudo column.
// If the table is pseudo, but the average error rate is less than MaxErrorRate,
// then the column is not pseudo.
const MaxErrorRate = 0.25
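
// Illustrative sketch (not part of the original file): with notNullCount() == 9,
// NDV == 4, NullCount == 0 and a current table count of 18,
// AvgCountPerNotNullValue(18) computes
//
//	factor       = 18 / 9 = 2      // GetIncreaseFactor
//	totalNotNull = 9 * 2  = 18
//	curNDV       = max(4*2, 1) = 8
//	result       = 18 / 8 = 2.25   // estimated rows per distinct value
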
// NotAccurate is true when the total number of queries is zero or the average error
// rate is greater than MaxErrorRate.
func (e *ErrorRate) NotAccurate() bool {
	if e.QueryTotal == 0 {
		return true
	}
	return e.ErrorTotal/float64(e.QueryTotal) > MaxErrorRate
}

// UFIDelate updates the ErrorRate.
func (e *ErrorRate) UFIDelate(rate float64) {
	e.QueryTotal++
	e.ErrorTotal += rate
}

// Merge merges two ErrorRates.
func (e *ErrorRate) Merge(rate *ErrorRate) {
	e.QueryTotal += rate.QueryTotal
	e.ErrorTotal += rate.ErrorTotal
}

// DeferredCauset represents a column histogram.
type DeferredCauset struct {
	Histogram
	*CMSketch
	PhysicalID int64
	Count      int64
	Info       *perceptron.DeferredCausetInfo
	IsHandle   bool
	ErrorRate
	Flag           int64
	LastAnalyzePos types.Causet
}

func (c *DeferredCauset) String() string {
	return c.Histogram.ToString(0)
}

// MemoryUsage returns the total memory usage of the Histogram and CMSketch in DeferredCauset.
// We ignore the size of other metadata in DeferredCauset.
func (c *DeferredCauset) MemoryUsage() (sum int64) {
	sum = c.Histogram.MemoryUsage()
	if c.CMSketch != nil {
		sum += c.CMSketch.MemoryUsage()
	}
	return
}

// HistogramNeededDeferredCausets stores the columns whose Histograms need to be loaded from the physical ekv layer.
// Currently, we only load index/pk Histograms from ekv automatically; columns' histograms are loaded on demand.
var HistogramNeededDeferredCausets = neededDeferredCausetMap{defcaus: map[blockDeferredCausetID]struct{}{}}

// IsInvalid checks if this column is invalid. If this column has a histogram that has not been loaded yet,
// we mark it as needing its histogram.
func (c *DeferredCauset) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool {
	if collPseudo && c.NotAccurate() {
		return true
	}
	if c.NDV > 0 && c.Len() == 0 && sc != nil {
		sc.SetHistogramsNotLoad()
		HistogramNeededDeferredCausets.insert(blockDeferredCausetID{TableID: c.PhysicalID, DeferredCausetID: c.Info.ID})
	}
	return c.TotalRowCount() == 0 || (c.NDV > 0 && c.Len() == 0)
}

func (c *DeferredCauset) equalRowCount(sc *stmtctx.StatementContext, val types.Causet, modifyCount int64) (float64, error) {
	if val.IsNull() {
		return float64(c.NullCount), nil
	}
	// All the values are null.
	if c.Histogram.Bounds.NumRows() == 0 {
		return 0.0, nil
	}
	if c.NDV > 0 && c.outOfRange(val) {
		return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
	}
	if c.CMSketch != nil {
		count, err := c.CMSketch.queryValue(sc, val)
		return float64(count), errors.Trace(err)
	}
	return c.Histogram.equalRowCount(val), nil
}
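
// Illustrative sketch (not part of the original file): equalRowCount above tries
// its estimators in a fixed order for a point predicate `col = v`:
//
//	v is NULL            -> NullCount
//	histogram not loaded -> 0
//	v out of range       -> outOfRangeEQSelectivity(...) * TotalRowCount()
//	CMSketch available   -> CMSketch.queryValue(sc, v)
//	otherwise            -> Histogram.equalRowCount(v)
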
// GetDeferredCausetRowCount estimates the row count for a slice of Range.
func (c *DeferredCauset) GetDeferredCausetRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64, pkIsHandle bool) (float64, error) {
	var rowCount float64
	for _, rg := range ranges {
		highVal := *rg.HighVal[0].Clone()
		lowVal := *rg.LowVal[0].Clone()
		if highVal.HoTT() == types.HoTTString {
			highVal.SetBytesAsString(collate.GetDefCauslator(
				highVal.DefCauslation()).Key(highVal.GetString()),
				highVal.DefCauslation(),
				uint32(highVal.Length()),
			)
		}
		if lowVal.HoTT() == types.HoTTString {
			lowVal.SetBytesAsString(collate.GetDefCauslator(
				lowVal.DefCauslation()).Key(lowVal.GetString()),
				lowVal.DefCauslation(),
				uint32(lowVal.Length()),
			)
		}
		cmp, err := lowVal.CompareCauset(sc, &highVal)
		if err != nil {
			return 0, errors.Trace(err)
		}
		if cmp == 0 {
			// The point case.
			if !rg.LowExclude && !rg.HighExclude {
				// In this case, the row count is at most 1.
				if pkIsHandle {
					rowCount += 1
					continue
				}
				var cnt float64
				cnt, err = c.equalRowCount(sc, lowVal, modifyCount)
				if err != nil {
					return 0, errors.Trace(err)
				}
				rowCount += cnt
			}
			continue
		}
		rangeVals := enumRangeValues(lowVal, highVal, rg.LowExclude, rg.HighExclude)
		// The small range case.
		if rangeVals != nil {
			for _, val := range rangeVals {
				cnt, err := c.equalRowCount(sc, val, modifyCount)
				if err != nil {
					return 0, err
				}
				rowCount += cnt
			}
			continue
		}
		// The interval case.
		cnt := c.BetweenRowCount(lowVal, highVal)
		if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
		}
		// `BetweenRowCount` returns the count for the [l, h) range; we adjust cnt for the boundaries here.
		// Note that `cnt` does not include null values; we need to specially handle cases
		// where null is the lower bound.
		if rg.LowExclude && !lowVal.IsNull() {
			lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount)
			if err != nil {
				return 0, errors.Trace(err)
			}
			cnt -= lowCnt
		}
		if !rg.LowExclude && lowVal.IsNull() {
			cnt += float64(c.NullCount)
		}
		if !rg.HighExclude {
			highCnt, err := c.equalRowCount(sc, highVal, modifyCount)
			if err != nil {
				return 0, errors.Trace(err)
			}
			cnt += highCnt
		}
		rowCount += cnt
	}
	if rowCount > c.TotalRowCount() {
		rowCount = c.TotalRowCount()
	} else if rowCount < 0 {
		rowCount = 0
	}
	return rowCount, nil
}

// Index represents an index histogram.
type Index struct {
	Histogram
	*CMSketch
	ErrorRate
	StatsVer       int64 // StatsVer is the version of the current stats, used to maintain compatibility.
	Info           *perceptron.IndexInfo
	Flag           int64
	LastAnalyzePos types.Causet
}

func (idx *Index) String() string {
	return idx.Histogram.ToString(len(idx.Info.DeferredCausets))
}

// IsInvalid checks if this index is invalid.
func (idx *Index) IsInvalid(collPseudo bool) bool {
	return (collPseudo && idx.NotAccurate()) || idx.TotalRowCount() == 0
}

// MemoryUsage returns the total memory usage of the Histogram and CMSketch in Index.
// We ignore the size of other metadata in Index.
func (idx *Index) MemoryUsage() (sum int64) {
	sum = idx.Histogram.MemoryUsage()
	if idx.CMSketch != nil {
		sum += idx.CMSketch.MemoryUsage()
	}
	return
}

var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewCauset(nil))

func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) (float64, error) {
	if len(idx.Info.DeferredCausets) == 1 {
		if bytes.Equal(b, nullKeyBytes) {
			return float64(idx.NullCount), nil
		}
	}
	val := types.NewBytesCauset(b)
	if idx.NDV > 0 && idx.outOfRange(val) {
		return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
	}
	if idx.CMSketch != nil {
		return float64(idx.CMSketch.QueryBytes(b)), nil
	}
	return idx.Histogram.equalRowCount(val), nil
}

// GetRowCount returns the row count of the given ranges.
// It uses the modifyCount to adjust the influence of modifications to the table.
func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
	totalCount := float64(0)
	isSingleDefCaus := len(idx.Info.DeferredCausets) == 1
	for _, indexRange := range indexRanges {
		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
		if err != nil {
			return 0, err
		}
		rb, err := codec.EncodeKey(sc, nil, indexRange.HighVal...)
		if err != nil {
			return 0, err
		}
		fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.DeferredCausets)
		if bytes.Equal(lb, rb) {
			if indexRange.LowExclude || indexRange.HighExclude {
				continue
			}
			if fullLen {
				// At most 1 in this case.
				if idx.Info.Unique {
					totalCount += 1
					continue
				}
				count, err := idx.equalRowCount(sc, lb, modifyCount)
				if err != nil {
					return 0, err
				}
				totalCount += count
				continue
			}
		}
		if indexRange.LowExclude {
			lb = ekv.Key(lb).PrefixNext()
		}
		if !indexRange.HighExclude {
			rb = ekv.Key(rb).PrefixNext()
		}
		l := types.NewBytesCauset(lb)
		r := types.NewBytesCauset(rb)
		totalCount += idx.BetweenRowCount(l, r)
		lowIsNull := bytes.Equal(lb, nullKeyBytes)
		if (idx.outOfRange(l) && !(isSingleDefCaus && lowIsNull)) || idx.outOfRange(r) {
			totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
		}
		if isSingleDefCaus && lowIsNull {
			totalCount += float64(idx.NullCount)
		}
	}
	if totalCount > idx.TotalRowCount() {
		totalCount = idx.TotalRowCount()
	}
	return totalCount, nil
}

type countByRangeFunc = func(*stmtctx.StatementContext, int64, []*ranger.Range) (float64, error)
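
// Illustrative sketch (not part of the original file): GetRowCount above encodes
// the range endpoints into index key space before estimating, so an exclusive
// bound simply becomes the next key, e.g.
//
//	lb, _ := codec.EncodeKey(sc, nil, indexRange.LowVal...) // sc supplied by the caller
//	if indexRange.LowExclude {
//		lb = ekv.Key(lb).PrefixNext() // (v, ...] becomes [next key after v, ...]
//	}
//
// and the estimate then reduces to BetweenRowCount over BytesCausets of the keys.
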
// newHistogramBySelectivity fills the content of the new histogram according to the given selectivity result.
// TODO: Causet is not efficient; try to avoid using it here.
// Also, there is redundant calculation with Selectivity(); we need to reduce it too.
func newHistogramBySelectivity(sc *stmtctx.StatementContext, histID int64, oldHist, newHist *Histogram, ranges []*ranger.Range, cntByRangeFunc countByRangeFunc) error {
	cntPerVal := int64(oldHist.AvgCountPerNotNullValue(int64(oldHist.TotalRowCount())))
	var totCnt int64
	for boundIdx, ranIdx, highRangeIdx := 0, 0, 0; boundIdx < oldHist.Bounds.NumRows() && ranIdx < len(ranges); boundIdx, ranIdx = boundIdx+2, highRangeIdx {
		for highRangeIdx < len(ranges) && chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx].HighVal[0]) >= 0 {
			highRangeIdx++
		}
		if boundIdx+2 >= oldHist.Bounds.NumRows() && highRangeIdx < len(ranges) && ranges[highRangeIdx].HighVal[0].HoTT() == types.HoTTMaxValue {
			highRangeIdx++
		}
		if ranIdx == highRangeIdx {
			continue
		}
		cnt, err := cntByRangeFunc(sc, histID, ranges[ranIdx:highRangeIdx])
		// This should not happen.
		if err != nil {
			return err
		}
		if cnt == 0 {
			continue
		}
		if int64(cnt) > oldHist.bucketCount(boundIdx/2) {
			cnt = float64(oldHist.bucketCount(boundIdx / 2))
		}
		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx))
		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx + 1))
		totCnt += int64(cnt)
		bkt := Bucket{Count: totCnt}
		if chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx-1].HighVal[0]) == 0 && !ranges[highRangeIdx-1].HighExclude {
			bkt.Repeat = cntPerVal
		}
		newHist.Buckets = append(newHist.Buckets, bkt)
		switch newHist.Tp.EvalType() {
		case types.ETString, types.ETDecimal, types.ETDatetime, types.ETTimestamp:
			newHist.scalars = append(newHist.scalars, oldHist.scalars[boundIdx/2])
		}
	}
	return nil
}

func (idx *Index) newIndexBySelectivity(sc *stmtctx.StatementContext, statsNode *StatsNode) (*Index, error) {
	var (
		ranLowEncode, ranHighEncode []byte
		err                         error
	)
	newIndexHist := &Index{Info: idx.Info, StatsVer: idx.StatsVer, CMSketch: idx.CMSketch}
	newIndexHist.Histogram = *NewHistogram(idx.ID, int64(float64(idx.NDV)*statsNode.Selectivity), 0, 0, types.NewFieldType(allegrosql.TypeBlob), chunk.InitialCapacity, 0)

	lowBucketIdx, highBucketIdx := 0, 0
	var totCnt int64

	// The bucket bounds of an index are encoded, so we need to decode them if we want to calculate the fraction accurately.
	// TODO: enhance its calculation.
	// For now, just remove the buckets that no range falls in.
	for _, ran := range statsNode.Ranges {
		lowBucketIdx = highBucketIdx
		ranLowEncode, ranHighEncode, err = ran.Encode(sc, ranLowEncode, ranHighEncode)
		if err != nil {
			return nil, err
		}
		for ; highBucketIdx < idx.Len(); highBucketIdx++ {
			// An encoded value can only advance to its next value quickly, so ranHighEncode is actually range.HighVal's PrefixNext value.
			// So the bound should also go to its PrefixNext.
			bucketLowerEncoded := idx.Bounds.GetRow(highBucketIdx * 2).GetBytes(0)
			if bytes.Compare(ranHighEncode, ekv.Key(bucketLowerEncoded).PrefixNext()) < 0 {
				break
			}
		}
		for ; lowBucketIdx < highBucketIdx; lowBucketIdx++ {
			bucketUpperEncoded := idx.Bounds.GetRow(lowBucketIdx*2 + 1).GetBytes(0)
			if bytes.Compare(ranLowEncode, bucketUpperEncoded) <= 0 {
				break
			}
		}
		if lowBucketIdx >= idx.Len() {
			break
		}
		for i := lowBucketIdx; i < highBucketIdx; i++ {
			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i * 2))
			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i*2 + 1))
			totCnt += idx.bucketCount(i)
			newIndexHist.Buckets = append(newIndexHist.Buckets, Bucket{Repeat: idx.Buckets[i].Repeat, Count: totCnt})
			newIndexHist.scalars = append(newIndexHist.scalars, idx.scalars[i])
		}
	}
	return newIndexHist, nil
}

// NewHistDefCauslBySelectivity creates a new HistDefCausl from the given statsNodes.
func (coll *HistDefCausl) NewHistDefCauslBySelectivity(sc *stmtctx.StatementContext, statsNodes []*StatsNode) *HistDefCausl {
	newDefCausl := &HistDefCausl{
		DeferredCausets:       make(map[int64]*DeferredCauset),
		Indices:               make(map[int64]*Index),
		Idx2DeferredCausetIDs: coll.Idx2DeferredCausetIDs,
		DefCausID2IdxID:       coll.DefCausID2IdxID,
		Count:                 coll.Count,
	}
	for _, node := range statsNodes {
		if node.Tp == IndexType {
			idxHist, ok := coll.Indices[node.ID]
			if !ok {
				continue
			}
			newIdxHist, err := idxHist.newIndexBySelectivity(sc, node)
			if err != nil {
				logutil.BgLogger().Warn("[Histogram-in-plan]: something wrong happened when calculating row count, "+
					"failed to build histogram for index",
					zap.String("index", idxHist.Info.Name.O), zap.String("causet", idxHist.Info.Block.O), zap.Error(err))
				continue
			}
			newDefCausl.Indices[node.ID] = newIdxHist
			continue
		}
		oldDefCaus, ok := coll.DeferredCausets[node.ID]
		if !ok {
			continue
		}
		newDefCaus := &DeferredCauset{
			PhysicalID: oldDefCaus.PhysicalID,
			Info:       oldDefCaus.Info,
			IsHandle:   oldDefCaus.IsHandle,
			CMSketch:   oldDefCaus.CMSketch,
		}
		newDefCaus.Histogram = *NewHistogram(oldDefCaus.ID, int64(float64(oldDefCaus.NDV)*node.Selectivity), 0, 0, oldDefCaus.Tp, chunk.InitialCapacity, 0)
		var err error
		splitRanges, ok := oldDefCaus.Histogram.SplitRange(sc, node.Ranges, false)
		if !ok {
			logutil.BgLogger().Warn("[Histogram-in-plan]: the type of histogram and ranges mismatch")
			continue
		}
		// Deal with some corner cases.
		if len(splitRanges) > 0 {
			// Deal with NULL values.
			if splitRanges[0].LowVal[0].IsNull() {
				newDefCaus.NullCount = oldDefCaus.NullCount
				if splitRanges[0].HighVal[0].IsNull() {
					splitRanges = splitRanges[1:]
				} else {
					splitRanges[0].LowVal[0].SetMinNotNull()
				}
			}
		}
		if oldDefCaus.IsHandle {
			err = newHistogramBySelectivity(sc, node.ID, &oldDefCaus.Histogram, &newDefCaus.Histogram, splitRanges, coll.GetRowCountByIntDeferredCausetRanges)
		} else {
			err = newHistogramBySelectivity(sc, node.ID, &oldDefCaus.Histogram, &newDefCaus.Histogram, splitRanges, coll.GetRowCountByDeferredCausetRanges)
		}
		if err != nil {
			logutil.BgLogger().Warn("[Histogram-in-plan]: something wrong happened when calculating row count",
				zap.Error(err))
			continue
		}
		newDefCausl.DeferredCausets[node.ID] = newDefCaus
	}
	for id, idx := range coll.Indices {
		_, ok := newDefCausl.Indices[id]
		if !ok {
			newDefCausl.Indices[id] = idx
		}
	}
	for id, col := range coll.DeferredCausets {
		_, ok := newDefCausl.DeferredCausets[id]
		if !ok {
			newDefCausl.DeferredCausets[id] = col
		}
	}
	return newDefCausl
}

func (idx *Index) outOfRange(val types.Causet) bool {
	if idx.Histogram.Len() == 0 {
		return true
	}
	withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
		matchPrefix(idx.Bounds.GetRow(0), 0, &val)
	withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
	return !withInLowBoundOrPrefixMatch || !withInHighBound
}

// matchPrefix checks whether ad is a prefix of the value at colIdx of the row.
func matchPrefix(event chunk.Row, colIdx int, ad *types.Causet) bool {
	switch ad.HoTT() {
	case types.HoTTString, types.HoTTBytes, types.HoTTBinaryLiteral, types.HoTTMysqlBit:
		return strings.HasPrefix(event.GetString(colIdx), ad.GetString())
	}
	return false
}

type dataCnt struct {
	data []byte
	cnt  uint64
}

func getIndexPrefixLens(data []byte, numDefCauss int) (prefixLens []int, err error) {
	prefixLens = make([]int, 0, numDefCauss)
	var colData []byte
	prefixLen := 0
	for len(data) > 0 {
		colData, data, err = codec.CutOne(data)
		if err != nil {
			return nil, err
		}
		prefixLen += len(colData)
		prefixLens = append(prefixLens, prefixLen)
	}
	return prefixLens, nil
}

// ExtractTopN extracts the top-N values from the histogram.
func (hg *Histogram) ExtractTopN(cms *CMSketch, numDefCauss int, numTopN uint32) error {
	if hg.Len() == 0 || cms == nil || numTopN == 0 {
		return nil
	}
	dataSet := make(map[string]struct{}, hg.Bounds.NumRows())
	dataCnts := make([]dataCnt, 0, hg.Bounds.NumRows())
	hg.PreCalculateScalar()
	// Set a limit on the frequency of boundary values to avoid extracting values with low frequency.
	limit := hg.notNullCount() / float64(hg.Len())
	// Since our histograms are equal-depth, frequent values must occur on the boundaries of buckets.
	for i := 0; i < hg.Bounds.NumRows(); i++ {
		data := hg.Bounds.GetRow(i).GetBytes(0)
		prefixLens, err := getIndexPrefixLens(data, numDefCauss)
		if err != nil {
			return err
		}
		for _, prefixLen := range prefixLens {
			prefixDefCausData := data[:prefixLen]
			_, ok := dataSet[string(prefixDefCausData)]
			if ok {
				continue
			}
			dataSet[string(prefixDefCausData)] = struct{}{}
			res := hg.BetweenRowCount(types.NewBytesCauset(prefixDefCausData), types.NewBytesCauset(ekv.Key(prefixDefCausData).PrefixNext()))
			if res >= limit {
				dataCnts = append(dataCnts, dataCnt{prefixDefCausData, uint64(res)})
			}
		}
	}
	sort.SliceStable(dataCnts, func(i, j int) bool { return dataCnts[i].cnt >= dataCnts[j].cnt })
	if len(dataCnts) > int(numTopN) {
		dataCnts = dataCnts[:numTopN]
	}
	cms.topN = make(map[uint64][]*TopNMeta, len(dataCnts))
	for _, dataCnt := range dataCnts {
		h1, h2 := murmur3.Sum128(dataCnt.data)
		realCnt := cms.queryHashValue(h1, h2)
		cms.subValue(h1, h2, realCnt)
		cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, dataCnt.data, realCnt})
	}
	return nil
}
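
// Illustrative sketch (not part of the original file): ExtractTopN above moves the
// heaviest boundary values out of the CM sketch and into cms.topN, so a caller
// building index stats might do something like:
//
//	// hg's bounds are encoded index prefixes; cms is the matching CMSketch.
//	if err := hg.ExtractTopN(cms, len(idxInfo.DeferredCausets), 20); err != nil { // idxInfo and the limit 20 are hypothetical
//		return err
//	}
//
// After the call, each extracted value's count has been subtracted from the sketch
// and recorded in a TopNMeta entry keyed by its murmur3 hash.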