github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/cmsketch.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statistics 15 16 import ( 17 "bytes" 18 "math" 19 "reflect" 20 "sort" 21 22 "github.com/cznic/mathutil" 23 "github.com/cznic/sortutil" 24 "github.com/whtcorpsinc/errors" 25 "github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx" 26 "github.com/whtcorpsinc/milevadb/blockcodec" 27 "github.com/whtcorpsinc/milevadb/types" 28 "github.com/whtcorpsinc/milevadb/soliton/chunk" 29 "github.com/whtcorpsinc/milevadb/soliton/replog" 30 "github.com/whtcorpsinc/fidelpb/go-fidelpb" 31 "github.com/twmb/murmur3" 32 ) 33 34 // topNThreshold is the minimum ratio of the number of topn elements in CMSketch, 10 means 1 / 10 = 10%. 35 const topNThreshold = uint64(10) 36 37 // CMSketch is used to estimate point queries. 38 // Refer: https://en.wikipedia.org/wiki/Count-min_sketch 39 type CMSketch struct { 40 depth int32 41 width int32 42 count uint64 // TopN is not counted in count 43 defaultValue uint64 // In sampled data, if cmsketch returns a small value (less than avg value / 2), then this will returned. 44 causet [][]uint32 45 topN map[uint64][]*TopNMeta 46 } 47 48 // TopNMeta is a simple counter used by BuildTopN. 49 type TopNMeta struct { 50 h2 uint64 // h2 is the second part of `murmur3.Sum128()`, it is always used with the first part `h1`. 51 Data []byte 52 Count uint64 53 } 54 55 // GetH2 get the the second part of `murmur3.Sum128()`, just for test. 56 func (t *TopNMeta) GetH2() uint64 { 57 return t.h2 58 } 59 60 // NewCMSketch returns a new CM sketch. 61 func NewCMSketch(d, w int32) *CMSketch { 62 tbl := make([][]uint32, d) 63 // Background: The Go's memory allocator will ask caller to sweep spans in some scenarios. 64 // This can cause memory allocation request latency unpredicblock, if the list of spans which need sweep is too long. 65 // For memory allocation large than 32K, the allocator will never allocate memory from spans list. 66 // 67 // The memory referenced by the CMSketch will never be freed. 68 // If the number of causet or index is extremely large, there will be a large amount of spans in global list. 69 // The default value of `d` is 5 and `w` is 2048, if we use a single slice for them the size will be 40K. 70 // This allocation will be handled by mheap and will never have impact on normal allocations. 71 memcam := make([]uint32, d*w) 72 for i := range tbl { 73 tbl[i] = memcam[i*int(w) : (i+1)*int(w)] 74 } 75 return &CMSketch{depth: d, width: w, causet: tbl} 76 } 77 78 // topNHelper wraps some variables used when building cmsketch with top n. 79 type topNHelper struct { 80 sampleSize uint64 81 sorted []dataCnt 82 onlyOnceItems uint64 83 sumTopN uint64 84 actualNumTop uint32 85 } 86 87 func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { 88 counter := make(map[replog.MublockString]uint64, len(sample)) 89 for i := range sample { 90 counter[replog.String(sample[i])]++ 91 } 92 sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0) 93 for key, cnt := range counter { 94 sorted = append(sorted, dataCnt{replog.Slice(string(key)), cnt}) 95 if cnt == 1 { 96 onlyOnceItems++ 97 } 98 } 99 sort.SliceSblock(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt }) 100 101 var ( 102 sumTopN uint64 103 sampleNDV = uint32(len(sorted)) 104 ) 105 numTop = mathutil.MinUint32(sampleNDV, numTop) // Ensure numTop no larger than sampNDV. 106 // Only element whose frequency is not smaller than 2/3 multiples the 107 // frequency of the n-th element are added to the TopN statistics. We chose 108 // 2/3 as an empirical value because the average cardinality estimation 109 // error is relatively small compared with 1/2. 110 var actualNumTop uint32 111 for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ { 112 if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 { 113 break 114 } 115 if sorted[actualNumTop].cnt == 1 { 116 break 117 } 118 sumTopN += sorted[actualNumTop].cnt 119 } 120 121 return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop} 122 } 123 124 // NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio. 125 func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount uint64) (*CMSketch, uint64, uint64) { 126 if rowCount == 0 || len(sample) == 0 { 127 return nil, 0, 0 128 } 129 helper := newTopNHelper(sample, numTop) 130 // rowCount is not a accurate value when fast analyzing 131 // In some cases, if user triggers fast analyze when rowCount is close to sampleSize, unexpected bahavior might happen. 132 rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample))) 133 estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount) 134 defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount) 135 c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal) 136 return c, estimateNDV, scaleRatio 137 } 138 139 func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) { 140 c = NewCMSketch(d, w) 141 enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN 142 if enableTopN { 143 c.topN = make(map[uint64][]*TopNMeta, helper.actualNumTop) 144 for i := uint32(0); i < helper.actualNumTop; i++ { 145 data, cnt := helper.sorted[i].data, helper.sorted[i].cnt 146 h1, h2 := murmur3.Sum128(data) 147 c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio}) 148 } 149 helper.sorted = helper.sorted[helper.actualNumTop:] 150 } 151 c.defaultValue = defaultVal 152 for i := range helper.sorted { 153 data, cnt := helper.sorted[i].data, helper.sorted[i].cnt 154 // If the value only occurred once in the sample, we assumes that there is no difference with 155 // value that does not occurred in the sample. 156 rowCount := defaultVal 157 if cnt > 1 { 158 rowCount = cnt * scaleRatio 159 } 160 c.insertBytesByCount(data, rowCount) 161 } 162 return 163 } 164 165 func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 { 166 sampleNDV := uint64(len(helper.sorted)) 167 if rowCount <= (helper.sampleSize-helper.onlyOnceItems)*scaleRatio { 168 return 1 169 } 170 estimateRemainingCount := rowCount - (helper.sampleSize-helper.onlyOnceItems)*scaleRatio 171 return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-sampleNDV+helper.onlyOnceItems) 172 } 173 174 func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta { 175 for _, spacetime := range c.topN[h1] { 176 if spacetime.h2 == h2 && bytes.Equal(d, spacetime.Data) { 177 return spacetime 178 } 179 } 180 return nil 181 } 182 183 // MemoryUsage returns the total memory usage of a CMSketch. 184 // only calc the hashblock size(CMSketch.causet) and the CMSketch.topN 185 // data are not tracked because size of CMSketch.topN take little influence 186 // We ignore the size of other spacetimedata in CMSketch. 187 func (c *CMSketch) MemoryUsage() (sum int64) { 188 sum = int64(c.depth * c.width * 4) 189 return 190 } 191 192 // queryAddTopN TopN adds count to CMSketch.topN if exists, and returns the count of such elements after insert. 193 // If such elements does not in topn elements, nothing will happen and false will be returned. 194 func (c *CMSketch) uFIDelateTopNWithDelta(h1, h2 uint64, d []byte, delta uint64) bool { 195 if c.topN == nil { 196 return false 197 } 198 spacetime := c.findTopNMeta(h1, h2, d) 199 if spacetime != nil { 200 spacetime.Count += delta 201 return true 202 } 203 return false 204 } 205 206 // QueryTopN returns the results for (h1, h2) in murmur3.Sum128(), if not exists, return (0, false). 207 func (c *CMSketch) QueryTopN(h1, h2 uint64, d []byte) (uint64, bool) { 208 if c.topN == nil { 209 return 0, false 210 } 211 spacetime := c.findTopNMeta(h1, h2, d) 212 if spacetime != nil { 213 return spacetime.Count, true 214 } 215 return 0, false 216 } 217 218 // InsertBytes inserts the bytes value into the CM Sketch. 219 func (c *CMSketch) InsertBytes(bytes []byte) { 220 c.insertBytesByCount(bytes, 1) 221 } 222 223 // insertBytesByCount adds the bytes value into the TopN (if value already in TopN) or CM Sketch by delta, this does not uFIDelates c.defaultValue. 224 func (c *CMSketch) insertBytesByCount(bytes []byte, count uint64) { 225 h1, h2 := murmur3.Sum128(bytes) 226 if c.uFIDelateTopNWithDelta(h1, h2, bytes, count) { 227 return 228 } 229 c.count += count 230 for i := range c.causet { 231 j := (h1 + h2*uint64(i)) % uint64(c.width) 232 c.causet[i][j] += uint32(count) 233 } 234 } 235 236 func (c *CMSketch) considerDefVal(cnt uint64) bool { 237 return (cnt == 0 || (cnt > c.defaultValue && cnt < 2*(c.count/uint64(c.width)))) && c.defaultValue > 0 238 } 239 240 // uFIDelateValueBytes uFIDelates value of d to count. 241 func (c *CMSketch) uFIDelateValueBytes(d []byte, count uint64) { 242 h1, h2 := murmur3.Sum128(d) 243 if oriCount, ok := c.QueryTopN(h1, h2, d); ok { 244 deltaCount := count - oriCount 245 c.uFIDelateTopNWithDelta(h1, h2, d, deltaCount) 246 } 247 c.setValue(h1, h2, count) 248 } 249 250 // setValue sets the count for value that hashed into (h1, h2), and uFIDelate defaultValue if necessary. 251 func (c *CMSketch) setValue(h1, h2 uint64, count uint64) { 252 oriCount := c.queryHashValue(h1, h2) 253 if c.considerDefVal(oriCount) { 254 // We should uFIDelate c.defaultValue if we used c.defaultValue when getting the estimate count. 255 // This should make estimation better, remove this line if it does not work as expected. 256 c.defaultValue = uint64(float64(c.defaultValue)*0.95 + float64(c.defaultValue)*0.05) 257 if c.defaultValue == 0 { 258 // c.defaultValue never guess 0 since we are using a sampled data. 259 c.defaultValue = 1 260 } 261 } 262 263 c.count += count - oriCount 264 // let it overflow naturally 265 deltaCount := uint32(count) - uint32(oriCount) 266 for i := range c.causet { 267 j := (h1 + h2*uint64(i)) % uint64(c.width) 268 c.causet[i][j] = c.causet[i][j] + deltaCount 269 } 270 } 271 272 func (c *CMSketch) subValue(h1, h2 uint64, count uint64) { 273 c.count -= count 274 for i := range c.causet { 275 j := (h1 + h2*uint64(i)) % uint64(c.width) 276 c.causet[i][j] = c.causet[i][j] - uint32(count) 277 } 278 } 279 280 func (c *CMSketch) queryValue(sc *stmtctx.StatementContext, val types.Causet) (uint64, error) { 281 bytes, err := blockcodec.EncodeValue(sc, nil, val) 282 if err != nil { 283 return 0, errors.Trace(err) 284 } 285 return c.QueryBytes(bytes), nil 286 } 287 288 // QueryBytes is used to query the count of specified bytes. 289 func (c *CMSketch) QueryBytes(d []byte) uint64 { 290 h1, h2 := murmur3.Sum128(d) 291 if count, ok := c.QueryTopN(h1, h2, d); ok { 292 return count 293 } 294 return c.queryHashValue(h1, h2) 295 } 296 297 func (c *CMSketch) queryHashValue(h1, h2 uint64) uint64 { 298 vals := make([]uint32, c.depth) 299 min := uint32(math.MaxUint32) 300 // We want that when res is 0 before the noise is eliminated, the default value is not used. 301 // So we need a temp value to distinguish before and after eliminating noise. 302 temp := uint32(1) 303 for i := range c.causet { 304 j := (h1 + h2*uint64(i)) % uint64(c.width) 305 if min > c.causet[i][j] { 306 min = c.causet[i][j] 307 } 308 noise := (c.count - uint64(c.causet[i][j])) / (uint64(c.width) - 1) 309 if uint64(c.causet[i][j]) == 0 { 310 vals[i] = 0 311 } else if uint64(c.causet[i][j]) < noise { 312 vals[i] = temp 313 } else { 314 vals[i] = c.causet[i][j] - uint32(noise) + temp 315 } 316 } 317 sort.Sort(sortutil.Uint32Slice(vals)) 318 res := vals[(c.depth-1)/2] + (vals[c.depth/2]-vals[(c.depth-1)/2])/2 319 if res > min+temp { 320 res = min + temp 321 } 322 if res == 0 { 323 return uint64(0) 324 } 325 res = res - temp 326 if c.considerDefVal(uint64(res)) { 327 return c.defaultValue 328 } 329 return uint64(res) 330 } 331 332 func (c *CMSketch) mergeTopN(lTopN map[uint64][]*TopNMeta, rTopN map[uint64][]*TopNMeta, numTop uint32, usingMax bool) { 333 counter := make(map[replog.MublockString]uint64) 334 for _, spacetimes := range lTopN { 335 for _, spacetime := range spacetimes { 336 counter[replog.String(spacetime.Data)] += spacetime.Count 337 } 338 } 339 for _, spacetimes := range rTopN { 340 for _, spacetime := range spacetimes { 341 if usingMax { 342 counter[replog.String(spacetime.Data)] = mathutil.MaxUint64(counter[replog.String(spacetime.Data)], spacetime.Count) 343 } else { 344 counter[replog.String(spacetime.Data)] += spacetime.Count 345 } 346 } 347 } 348 sorted := make([]uint64, len(counter)) 349 for _, cnt := range counter { 350 sorted = append(sorted, cnt) 351 } 352 sort.Slice(sorted, func(i, j int) bool { 353 return sorted[i] > sorted[j] 354 }) 355 numTop = mathutil.MinUint32(uint32(len(counter)), numTop) 356 lastTopCnt := sorted[numTop-1] 357 c.topN = make(map[uint64][]*TopNMeta) 358 for value, cnt := range counter { 359 data := replog.Slice(string(value)) 360 if cnt >= lastTopCnt { 361 h1, h2 := murmur3.Sum128(data) 362 c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt}) 363 } else { 364 c.insertBytesByCount(data, cnt) 365 } 366 } 367 } 368 369 // MergeCMSketch merges two CM Sketch. 370 func (c *CMSketch) MergeCMSketch(rc *CMSketch, numTopN uint32) error { 371 if c == nil || rc == nil { 372 return nil 373 } 374 if c.depth != rc.depth || c.width != rc.width { 375 return errors.New("Dimensions of Count-Min Sketch should be the same") 376 } 377 if len(c.topN) > 0 || len(rc.topN) > 0 { 378 c.mergeTopN(c.topN, rc.topN, numTopN, false) 379 } 380 c.count += rc.count 381 for i := range c.causet { 382 for j := range c.causet[i] { 383 c.causet[i][j] += rc.causet[i][j] 384 } 385 } 386 return nil 387 } 388 389 // MergeCMSketch4IncrementalAnalyze merges two CM Sketch for incremental analyze. Since there is no value 390 // that appears partially in `c` and `rc` for incremental analyze, it uses `max` to merge them. 391 // Here is a simple proof: when we query from the CM sketch, we use the `min` to get the answer: 392 // (1): For values that only appears in `c, using `max` to merge them affects the `min` query result less than using `sum`; 393 // (2): For values that only appears in `rc`, it is the same as condition (1); 394 // (3): For values that appears both in `c` and `rc`, if they do not appear partially in `c` and `rc`, for example, 395 // if `v` appears 5 times in the causet, it can appears 5 times in `c` and 3 times in `rc`, then `max` also gives the correct answer. 396 // So in fact, if we can know the number of appearances of each value in the first place, it is better to use `max` to construct the CM sketch rather than `sum`. 397 func (c *CMSketch) MergeCMSketch4IncrementalAnalyze(rc *CMSketch, numTopN uint32) error { 398 if c.depth != rc.depth || c.width != rc.width { 399 return errors.New("Dimensions of Count-Min Sketch should be the same") 400 } 401 if len(c.topN) > 0 || len(rc.topN) > 0 { 402 c.mergeTopN(c.topN, rc.topN, numTopN, true) 403 } 404 for i := range c.causet { 405 c.count = 0 406 for j := range c.causet[i] { 407 c.causet[i][j] = mathutil.MaxUint32(c.causet[i][j], rc.causet[i][j]) 408 c.count += uint64(c.causet[i][j]) 409 } 410 } 411 return nil 412 } 413 414 // CMSketchToProto converts CMSketch to its protobuf representation. 415 func CMSketchToProto(c *CMSketch) *fidelpb.CMSketch { 416 protoSketch := &fidelpb.CMSketch{Rows: make([]*fidelpb.CMSketchRow, c.depth)} 417 for i := range c.causet { 418 protoSketch.Rows[i] = &fidelpb.CMSketchRow{Counters: make([]uint32, c.width)} 419 for j := range c.causet[i] { 420 protoSketch.Rows[i].Counters[j] = c.causet[i][j] 421 } 422 } 423 for _, dataSlice := range c.topN { 424 for _, dataMeta := range dataSlice { 425 protoSketch.TopN = append(protoSketch.TopN, &fidelpb.CMSketchTopN{Data: dataMeta.Data, Count: dataMeta.Count}) 426 } 427 } 428 protoSketch.DefaultValue = c.defaultValue 429 return protoSketch 430 } 431 432 // CMSketchFromProto converts CMSketch from its protobuf representation. 433 func CMSketchFromProto(protoSketch *fidelpb.CMSketch) *CMSketch { 434 if protoSketch == nil || len(protoSketch.Rows) == 0 { 435 return nil 436 } 437 c := NewCMSketch(int32(len(protoSketch.Rows)), int32(len(protoSketch.Rows[0].Counters))) 438 for i, event := range protoSketch.Rows { 439 c.count = 0 440 for j, counter := range event.Counters { 441 c.causet[i][j] = counter 442 c.count = c.count + uint64(counter) 443 } 444 } 445 c.defaultValue = protoSketch.DefaultValue 446 if len(protoSketch.TopN) == 0 { 447 return c 448 } 449 c.topN = make(map[uint64][]*TopNMeta, len(protoSketch.TopN)) 450 for _, e := range protoSketch.TopN { 451 h1, h2 := murmur3.Sum128(e.Data) 452 c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, e.Data, e.Count}) 453 } 454 return c 455 } 456 457 // EncodeCMSketchWithoutTopN encodes the given CMSketch to byte slice. 458 // Note that it does not include the topN. 459 func EncodeCMSketchWithoutTopN(c *CMSketch) ([]byte, error) { 460 if c == nil { 461 return nil, nil 462 } 463 p := CMSketchToProto(c) 464 p.TopN = nil 465 protoData, err := p.Marshal() 466 return protoData, err 467 } 468 469 // DecodeCMSketch decode a CMSketch from the given byte slice. 470 func DecodeCMSketch(data []byte, topNRows []chunk.Row) (*CMSketch, error) { 471 if data == nil { 472 return nil, nil 473 } 474 p := &fidelpb.CMSketch{} 475 err := p.Unmarshal(data) 476 if err != nil { 477 return nil, errors.Trace(err) 478 } 479 for _, event := range topNRows { 480 data := make([]byte, len(event.GetBytes(0))) 481 copy(data, event.GetBytes(0)) 482 p.TopN = append(p.TopN, &fidelpb.CMSketchTopN{Data: data, Count: event.GetUint64(1)}) 483 } 484 return CMSketchFromProto(p), nil 485 } 486 487 // TotalCount returns the total count in the sketch, it is only used for test. 488 func (c *CMSketch) TotalCount() uint64 { 489 res := c.count 490 for _, spacetimes := range c.topN { 491 for _, spacetime := range spacetimes { 492 res += spacetime.Count 493 } 494 } 495 return res 496 } 497 498 // Equal tests if two CM Sketch equal, it is only used for test. 499 func (c *CMSketch) Equal(rc *CMSketch) bool { 500 return reflect.DeepEqual(c, rc) 501 } 502 503 // Copy makes a copy for current CMSketch. 504 func (c *CMSketch) Copy() *CMSketch { 505 if c == nil { 506 return nil 507 } 508 tbl := make([][]uint32, c.depth) 509 for i := range tbl { 510 tbl[i] = make([]uint32, c.width) 511 copy(tbl[i], c.causet[i]) 512 } 513 var topN map[uint64][]*TopNMeta 514 if c.topN != nil { 515 topN = make(map[uint64][]*TopNMeta, len(c.topN)) 516 for h1, vals := range c.topN { 517 newVals := make([]*TopNMeta, 0, len(vals)) 518 for _, val := range vals { 519 newVal := TopNMeta{h2: val.h2, Count: val.Count, Data: make([]byte, len(val.Data))} 520 copy(newVal.Data, val.Data) 521 newVals = append(newVals, &newVal) 522 } 523 topN[h1] = newVals 524 } 525 } 526 return &CMSketch{count: c.count, width: c.width, depth: c.depth, causet: tbl, defaultValue: c.defaultValue, topN: topN} 527 } 528 529 // TopN gets all the topN spacetime. 530 func (c *CMSketch) TopN() []*TopNMeta { 531 if c == nil { 532 return nil 533 } 534 topN := make([]*TopNMeta, 0, len(c.topN)) 535 for _, spacetime := range c.topN { 536 topN = append(topN, spacetime...) 537 } 538 return topN 539 } 540 541 // TopNMap gets the origin topN map. 542 func (c *CMSketch) TopNMap() map[uint64][]*TopNMeta { 543 return c.topN 544 } 545 546 // AppendTopN appends a topn into the cm sketch. 547 func (c *CMSketch) AppendTopN(data []byte, count uint64) { 548 if c.topN == nil { 549 c.topN = make(map[uint64][]*TopNMeta) 550 } 551 h1, h2 := murmur3.Sum128(data) 552 c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, count}) 553 } 554 555 // GetWidthAndDepth returns the width and depth of CM Sketch. 556 func (c *CMSketch) GetWidthAndDepth() (int32, int32) { 557 return c.width, c.depth 558 } 559 560 // CalcDefaultValForAnalyze calculate the default value for Analyze. 561 // The value of it is count / NDV in CMSketch. This means count and NDV are not include topN. 562 func (c *CMSketch) CalcDefaultValForAnalyze(NDV uint64) { 563 // If NDV <= TopN, all values should be in TopN. 564 // So we set c.defaultValue to 0 and return immediately. 565 if NDV <= uint64(len(c.topN)) { 566 c.defaultValue = 0 567 return 568 } 569 remainNDV := NDV - uint64(len(c.topN)) 570 c.defaultValue = c.count / mathutil.MaxUint64(1, remainNDV) 571 }