github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/interlock/aggfuncs/func_count_distinct.go

// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package aggfuncs

import (
	"encoding/binary"
	"math"
	"unsafe"

	"github.com/dgryski/go-farm"
	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/milevadb/memex"
	"github.com/whtcorpsinc/milevadb/stochastikctx"
	"github.com/whtcorpsinc/milevadb/types"
	"github.com/whtcorpsinc/milevadb/types/json"
	"github.com/whtcorpsinc/milevadb/soliton/chunk"
	"github.com/whtcorpsinc/milevadb/soliton/codec"
	"github.com/whtcorpsinc/milevadb/soliton/defCauslate"
	"github.com/whtcorpsinc/milevadb/soliton/replog"
	"github.com/whtcorpsinc/milevadb/soliton/set"
	"github.com/whtcorpsinc/milevadb/soliton/stringutil"
)

const (
	// DefPartialResult4CountDistinctIntSize is the size of partialResult4CountDistinctInt
	DefPartialResult4CountDistinctIntSize = int64(unsafe.Sizeof(partialResult4CountDistinctInt{}))
	// DefPartialResult4CountDistinctRealSize is the size of partialResult4CountDistinctReal
	DefPartialResult4CountDistinctRealSize = int64(unsafe.Sizeof(partialResult4CountDistinctReal{}))
	// DefPartialResult4CountDistinctDecimalSize is the size of partialResult4CountDistinctDecimal
	DefPartialResult4CountDistinctDecimalSize = int64(unsafe.Sizeof(partialResult4CountDistinctDecimal{}))
	// DefPartialResult4CountDistinctDurationSize is the size of partialResult4CountDistinctDuration
	DefPartialResult4CountDistinctDurationSize = int64(unsafe.Sizeof(partialResult4CountDistinctDuration{}))
	// DefPartialResult4CountDistinctStringSize is the size of partialResult4CountDistinctString
	DefPartialResult4CountDistinctStringSize = int64(unsafe.Sizeof(partialResult4CountDistinctString{}))
	// DefPartialResult4CountWithDistinctSize is the size of partialResult4CountWithDistinct
	DefPartialResult4CountWithDistinctSize = int64(unsafe.Sizeof(partialResult4CountWithDistinct{}))
	// DefPartialResult4ApproxCountDistinctSize is the size of partialResult4ApproxCountDistinct
	DefPartialResult4ApproxCountDistinctSize = int64(unsafe.Sizeof(partialResult4ApproxCountDistinct{}))
)

type partialResult4CountDistinctInt struct {
	valSet set.Int64Set
}

type countOriginalWithDistinct4Int struct {
	baseCount
}

func (e *countOriginalWithDistinct4Int) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctInt{
		valSet: set.NewInt64Set(),
	}), DefPartialResult4CountDistinctIntSize
}

func (e *countOriginalWithDistinct4Int) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctInt)(pr)
	p.valSet = set.NewInt64Set()
}

func (e *countOriginalWithDistinct4Int) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctInt)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
func (e *countOriginalWithDistinct4Int) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountDistinctInt)(pr)

	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalInt(sctx, event)
		if err != nil {
			return memDelta, err
		}
		if isNull {
			continue
		}
		if p.valSet.Exist(input) {
			continue
		}
		p.valSet.Insert(input)
		memDelta += DefInt64Size
	}

	return memDelta, nil
}

type partialResult4CountDistinctReal struct {
	valSet set.Float64Set
}

type countOriginalWithDistinct4Real struct {
	baseCount
}

func (e *countOriginalWithDistinct4Real) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctReal{
		valSet: set.NewFloat64Set(),
	}), DefPartialResult4CountDistinctRealSize
}

func (e *countOriginalWithDistinct4Real) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctReal)(pr)
	p.valSet = set.NewFloat64Set()
}

func (e *countOriginalWithDistinct4Real) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctReal)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}

func (e *countOriginalWithDistinct4Real) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountDistinctReal)(pr)

	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalReal(sctx, event)
		if err != nil {
			return memDelta, err
		}
		if isNull {
			continue
		}
		if p.valSet.Exist(input) {
			continue
		}
		p.valSet.Insert(input)
		memDelta += DefFloat64Size
	}

	return memDelta, nil
}

type partialResult4CountDistinctDecimal struct {
	valSet set.StringSet
}

type countOriginalWithDistinct4Decimal struct {
	baseCount
}

func (e *countOriginalWithDistinct4Decimal) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctDecimal{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountDistinctDecimalSize
}

func (e *countOriginalWithDistinct4Decimal) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctDecimal)(pr)
	p.valSet = set.NewStringSet()
}

func (e *countOriginalWithDistinct4Decimal) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctDecimal)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}

func (e *countOriginalWithDistinct4Decimal) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountDistinctDecimal)(pr)

	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalDecimal(sctx, event)
		if err != nil {
			return memDelta, err
		}
		if isNull {
			continue
		}
		hash, err := input.ToHashKey()
		if err != nil {
			return memDelta, err
		}
		decStr := string(replog.String(hash))
		if p.valSet.Exist(decStr) {
			continue
		}
		p.valSet.Insert(decStr)
		memDelta += int64(len(decStr))
	}

	return memDelta, nil
}
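// The sketch below is illustrative only and not part of the original file: it shows the
// de-duplication and memory-delta bookkeeping pattern shared by the UFIDelatePartialResult
// methods above, applied to plain int64 values instead of evaluated argument memexes.
// The function name and inputs are hypothetical.
func exampleDistinctInt64MemDelta(values []int64) (count int64, memDelta int64) {
	valSet := set.NewInt64Set()
	for _, v := range values {
		if valSet.Exist(v) {
			continue // duplicates grow neither the set nor the tracked memory usage
		}
		valSet.Insert(v)
		memDelta += DefInt64Size // each newly inserted key is charged to the memory estimate
	}
	return int64(valSet.Count()), memDelta
}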
type partialResult4CountDistinctDuration struct {
	valSet set.Int64Set
}

type countOriginalWithDistinct4Duration struct {
	baseCount
}

func (e *countOriginalWithDistinct4Duration) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctDuration{
		valSet: set.NewInt64Set(),
	}), DefPartialResult4CountDistinctDurationSize
}

func (e *countOriginalWithDistinct4Duration) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctDuration)(pr)
	p.valSet = set.NewInt64Set()
}

func (e *countOriginalWithDistinct4Duration) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctDuration)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}

func (e *countOriginalWithDistinct4Duration) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountDistinctDuration)(pr)

	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalDuration(sctx, event)
		if err != nil {
			return memDelta, err
		}
		if isNull {
			continue
		}

		if p.valSet.Exist(int64(input.Duration)) {
			continue
		}
		p.valSet.Insert(int64(input.Duration))
		memDelta += DefInt64Size
	}

	return memDelta, nil
}

type partialResult4CountDistinctString struct {
	valSet set.StringSet
}

type countOriginalWithDistinct4String struct {
	baseCount
}

func (e *countOriginalWithDistinct4String) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctString{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountDistinctStringSize
}

func (e *countOriginalWithDistinct4String) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctString)(pr)
	p.valSet = set.NewStringSet()
}

func (e *countOriginalWithDistinct4String) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctString)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}

func (e *countOriginalWithDistinct4String) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountDistinctString)(pr)
	defCauslator := defCauslate.GetDefCauslator(e.args[0].GetType().DefCauslate)

	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalString(sctx, event)
		if err != nil {
			return memDelta, err
		}
		if isNull {
			continue
		}
		input = string(defCauslator.Key(input))

		if p.valSet.Exist(input) {
			continue
		}
		input = stringutil.Copy(input)
		p.valSet.Insert(input)
		memDelta += int64(len(input))
	}

	return memDelta, nil
}

type countOriginalWithDistinct struct {
	baseCount
}

type partialResult4CountWithDistinct struct {
	valSet set.StringSet
}

func (e *countOriginalWithDistinct) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountWithDistinct{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountWithDistinctSize
}
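// The sketch below is illustrative only and not part of the original file: it shows how
// countOriginalWithDistinct4String above de-duplicates by collation sort key (via
// defCauslate.GetDefCauslator) rather than by raw bytes, so strings that compare equal
// under the collation are counted once. The collation name used here is a hypothetical example.
func exampleCollationAwareDistinct(values []string) int {
	defCauslator := defCauslate.GetDefCauslator("utf8mb4_general_ci")
	valSet := set.NewStringSet()
	for _, v := range values {
		// Insert the sort key, not the original string, so case differences collapse.
		valSet.Insert(string(defCauslator.Key(v)))
	}
	return valSet.Count()
}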
func (e *countOriginalWithDistinct) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountWithDistinct)(pr)
	p.valSet = set.NewStringSet()
}

func (e *countOriginalWithDistinct) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountWithDistinct)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}

func (e *countOriginalWithDistinct) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountWithDistinct)(pr)

	encodedBytes := make([]byte, 0)
	// Decimal struct is the biggest type we will use.
	buf := make([]byte, types.MyDecimalStructSize)

	for _, event := range rowsInGroup {
		var err error
		var hasNull, isNull bool
		encodedBytes = encodedBytes[:0]

		for i := 0; i < len(e.args) && !hasNull; i++ {
			encodedBytes, isNull, err = evalAndEncode(sctx, e.args[i], event, buf, encodedBytes)
			if err != nil {
				return memDelta, err
			}
			if isNull {
				hasNull = true
				break
			}
		}
		encodedString := string(encodedBytes)
		if hasNull || p.valSet.Exist(encodedString) {
			continue
		}
		p.valSet.Insert(encodedString)
		memDelta += int64(len(encodedString))
	}

	return memDelta, nil
}

// evalAndEncode evaluates one event with an expression and encodes the value to bytes.
func evalAndEncode(
	sctx stochastikctx.Context, arg memex.Expression,
	event chunk.Event, buf, encodedBytes []byte,
) (_ []byte, isNull bool, err error) {
	switch tp := arg.GetType().EvalType(); tp {
	case types.ETInt:
		var val int64
		val, isNull, err = arg.EvalInt(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendInt64(encodedBytes, buf, val)
	case types.ETReal:
		var val float64
		val, isNull, err = arg.EvalReal(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendFloat64(encodedBytes, buf, val)
	case types.ETDecimal:
		var val *types.MyDecimal
		val, isNull, err = arg.EvalDecimal(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes, err = appendDecimal(encodedBytes, val)
	case types.ETTimestamp, types.ETDatetime:
		var val types.Time
		val, isNull, err = arg.EvalTime(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendTime(encodedBytes, buf, val)
	case types.ETDuration:
		var val types.Duration
		val, isNull, err = arg.EvalDuration(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendDuration(encodedBytes, buf, val)
	case types.ETJson:
		var val json.BinaryJSON
		val, isNull, err = arg.EvalJSON(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendJSON(encodedBytes, buf, val)
	case types.ETString:
		var val string
		val, isNull, err = arg.EvalString(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = codec.EncodeCompactBytes(encodedBytes, replog.Slice(val))
	default:
		return nil, false, errors.Errorf("unsupported defCausumn type for encode %d", tp)
	}
	return encodedBytes, isNull, err
}
func appendInt64(encodedBytes, buf []byte, val int64) []byte {
	*(*int64)(unsafe.Pointer(&buf[0])) = val
	buf = buf[:8]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}

func appendFloat64(encodedBytes, buf []byte, val float64) []byte {
	*(*float64)(unsafe.Pointer(&buf[0])) = val
	buf = buf[:8]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}

func appendDecimal(encodedBytes []byte, val *types.MyDecimal) ([]byte, error) {
	hash, err := val.ToHashKey()
	encodedBytes = append(encodedBytes, hash...)
	return encodedBytes, err
}

func writeTime(buf []byte, t types.Time) {
	binary.BigEndian.PutUint16(buf, uint16(t.Year()))
	buf[2] = uint8(t.Month())
	buf[3] = uint8(t.Day())
	buf[4] = uint8(t.Hour())
	buf[5] = uint8(t.Minute())
	buf[6] = uint8(t.Second())
	binary.BigEndian.PutUint32(buf[8:], uint32(t.Microsecond()))
	buf[12] = t.Type()
	buf[13] = uint8(t.Fsp())
}

func appendTime(encodedBytes, buf []byte, val types.Time) []byte {
	writeTime(buf, val)
	buf = buf[:16]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}

func appendDuration(encodedBytes, buf []byte, val types.Duration) []byte {
	*(*types.Duration)(unsafe.Pointer(&buf[0])) = val
	buf = buf[:16]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}

func appendJSON(encodedBytes, _ []byte, val json.BinaryJSON) []byte {
	encodedBytes = append(encodedBytes, val.TypeCode)
	encodedBytes = append(encodedBytes, val.Value...)
	return encodedBytes
}

func intHash64(x uint64) uint64 {
	x ^= x >> 33
	x *= 0xff51afd7ed558ccd
	x ^= x >> 33
	x *= 0xc4ceb9fe1a85ec53
	x ^= x >> 33
	return x
}

type baseApproxCountDistinct struct {
	baseAggFunc
}

const (
	// The maximum degree of buffer size before the values are discarded
	uniquesHashMaxSizeDegree uint8 = 17
	// The maximum number of elements before the values are discarded
	uniquesHashMaxSize = uint32(1) << (uniquesHashMaxSizeDegree - 1)
	// Initial buffer size degree
	uniquesHashSetInitialSizeDegree uint8 = 4
	// The number of least significant bits used for thinning. The remaining high-order bits are used to determine the position in the hash table.
	uniquesHashBitsForSkip = 32 - uniquesHashMaxSizeDegree
)

type approxCountDistinctHashValue uint32
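// The sketch below is illustrative only and not part of the original file: it shows how the
// fixed-width encoders above can be chained to build one composite key for a multi-argument
// COUNT(DISTINCT ...), mirroring what UFIDelatePartialResult does through evalAndEncode.
// The function name and the literal values are hypothetical.
func exampleCompositeDistinctKey() string {
	// Decimal struct is the biggest type we will use, so one scratch buffer covers every encoder.
	buf := make([]byte, types.MyDecimalStructSize)
	encodedBytes := make([]byte, 0, 16)
	encodedBytes = appendInt64(encodedBytes, buf, 42)      // first argument, integer
	encodedBytes = appendFloat64(encodedBytes, buf, 3.125) // second argument, real
	return string(encodedBytes) // the key stored in (or hashed for) the distinct set
}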
// partialResult4ApproxCountDistinct uses the `BJKST` algorithm to compute an approximate result of count distinct.
// According to an experimental survey http://www.vldb.org/pvldb/vol11/p499-harmouch.pdf, the error guarantee of BJKST
// was even better than the theoretical lower bounds.
// For the calculation state, it uses a sample of element hash values with a size up to uniquesHashMaxSize. Compared
// with the widely known HyperLogLog algorithm, this algorithm is less effective in terms of accuracy and
// memory consumption (even up to proportionality), but it is adaptive. This means that with fairly high accuracy, it
// consumes less memory during simultaneous computation of cardinality for a large number of data sets whose cardinality
// has power law distribution (i.e. in cases when most of the data sets are small).
// This algorithm is also very accurate for data sets with small cardinality and very efficient on CPU. If the number of
// distinct elements is more than 2^32, the relative error may be high.
type partialResult4ApproxCountDistinct struct {
	size          uint32 /// Number of elements.
	sizeDegree    uint8  /// The size of the table as a power of 2.
	skiFIDelegree uint8  /// Skip elements not divisible by 2 ^ skiFIDelegree.
	hasZero       bool   /// The hash table contains an element with a hash value of 0.
	buf           []approxCountDistinctHashValue
}

// NewPartialResult4ApproxCountDistinct builds a partial result for agg function ApproxCountDistinct.
func NewPartialResult4ApproxCountDistinct() *partialResult4ApproxCountDistinct {
	p := &partialResult4ApproxCountDistinct{}
	p.reset()
	return p
}

func (p *partialResult4ApproxCountDistinct) InsertHash64(x uint64) {
	// no need to rehash, just cast into uint32
	p.insertHash(approxCountDistinctHashValue(x))
}

func (p *partialResult4ApproxCountDistinct) MemUsage() int64 {
	return int64(len(p.buf)) * DefUint32Size
}

func (p *partialResult4ApproxCountDistinct) alloc(newSizeDegree uint8) {
	p.size = 0
	p.skiFIDelegree = 0
	p.hasZero = false
	p.buf = make([]approxCountDistinctHashValue, uint32(1)<<newSizeDegree)
	p.sizeDegree = newSizeDegree
}

func (p *partialResult4ApproxCountDistinct) reset() {
	p.alloc(uniquesHashSetInitialSizeDegree)
}

func max(a, b uint8) uint8 {
	if a > b {
		return a
	}

	return b
}

func (p *partialResult4ApproxCountDistinct) bufSize() uint32 {
	return uint32(1) << p.sizeDegree
}

func (p *partialResult4ApproxCountDistinct) mask() uint32 {
	return p.bufSize() - 1
}

func (p *partialResult4ApproxCountDistinct) place(x approxCountDistinctHashValue) uint32 {
	return uint32(x>>uniquesHashBitsForSkip) & p.mask()
}

// Increase the size of the buffer 2 times or up to new size degree.
func (p *partialResult4ApproxCountDistinct) resize(newSizeDegree uint8) {
	oldSize := p.bufSize()
	oldBuf := p.buf

	if 0 == newSizeDegree {
		newSizeDegree = p.sizeDegree + 1
	}

	p.buf = make([]approxCountDistinctHashValue, uint32(1)<<newSizeDegree)
	p.sizeDegree = newSizeDegree

	// Move some items to new locations.
	for i := uint32(0); i < oldSize; i++ {
		x := oldBuf[i]
		if x != 0 {
			p.reinsertImpl(x)
		}
	}
}

func (p *partialResult4ApproxCountDistinct) readAndMerge(rb []byte) error {
	rhsSkiFIDelegree := rb[0]
	rb = rb[1:]

	if rhsSkiFIDelegree > p.skiFIDelegree {
		p.skiFIDelegree = rhsSkiFIDelegree
		p.rehash()
	}

	rb, rhsSize, err := codec.DecodeUvarint(rb)

	if err != nil {
		return err
	}

	if rhsSize > uint64(uniquesHashMaxSize) {
		return errors.New("Cannot read partialResult4ApproxCountDistinct: too large size degree")
	}

	if p.bufSize() < uint32(rhsSize) {
		newSizeDegree := max(uniquesHashSetInitialSizeDegree, uint8(math.Log2(float64(rhsSize-1)))+2)
		p.resize(newSizeDegree)
	}

	for i := uint32(0); i < uint32(rhsSize); i++ {
		x := *(*approxCountDistinctHashValue)(unsafe.Pointer(&rb[0]))
		rb = rb[4:]
		p.insertHash(x)
	}

	return err
}
// Correct the systematic error due to collisions during hashing in uint32.
func (p *partialResult4ApproxCountDistinct) fixedSize() uint64 {
	if 0 == p.skiFIDelegree {
		return uint64(p.size)
	}

	res := uint64(p.size) * (uint64(1) << p.skiFIDelegree)

	// Pseudo-random remainder.
	res += intHash64(uint64(p.size)) & ((uint64(1) << p.skiFIDelegree) - 1)

	// Find the number of distinct elements which, when randomly scattered across 2^32 buckets,
	// would fill `res` buckets on average (linear-counting inversion).
	p32 := uint64(1) << 32
	fixedRes := math.Round(float64(p32) * (math.Log(float64(p32)) - math.Log(float64(p32-res))))
	return uint64(fixedRes)
}

func (p *partialResult4ApproxCountDistinct) insertHash(hashValue approxCountDistinctHashValue) {
	if !p.good(hashValue) {
		return
	}

	p.insertImpl(hashValue)
	p.shrinkIfNeed()
}

// The value is divisible by 2 ^ skip_degree
func (p *partialResult4ApproxCountDistinct) good(hash approxCountDistinctHashValue) bool {
	return hash == ((hash >> p.skiFIDelegree) << p.skiFIDelegree)
}

// Insert a value
func (p *partialResult4ApproxCountDistinct) insertImpl(x approxCountDistinctHashValue) {
	if x == 0 {
		if !p.hasZero {
			p.size += 1
		}
		p.hasZero = true
		return
	}

	placeValue := p.place(x)
	for p.buf[placeValue] != 0 && p.buf[placeValue] != x {
		placeValue++
		placeValue &= p.mask()
	}

	if p.buf[placeValue] == x {
		return
	}

	p.buf[placeValue] = x
	p.size++
}

// If the hash table is full enough, then do resize.
// If there are too many items, then throw half the pieces until they are small enough.
func (p *partialResult4ApproxCountDistinct) shrinkIfNeed() {
	if p.size > p.maxFill() {
		if p.size > uniquesHashMaxSize {
			for p.size > uniquesHashMaxSize {
				p.skiFIDelegree++
				p.rehash()
			}
		} else {
			p.resize(0)
		}
	}
}

func (p *partialResult4ApproxCountDistinct) maxFill() uint32 {
	return uint32(1) << (p.sizeDegree - 1)
}

// Delete all values whose hashes are not divisible by 2 ^ skip_degree
func (p *partialResult4ApproxCountDistinct) rehash() {
	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 && !p.good(p.buf[i]) {
			p.buf[i] = 0
			p.size--
		}
	}

	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 && i != p.place(p.buf[i]) {
			x := p.buf[i]
			p.buf[i] = 0
			p.reinsertImpl(x)
		}
	}
}

// Insert a value into the new buffer that was in the old buffer.
// Used when increasing the size of the buffer, as well as when reading from a file.
func (p *partialResult4ApproxCountDistinct) reinsertImpl(x approxCountDistinctHashValue) {
	placeValue := p.place(x)
	for p.buf[placeValue] != 0 {
		placeValue++
		placeValue &= p.mask()
	}

	p.buf[placeValue] = x
}

func (p *partialResult4ApproxCountDistinct) merge(tar *partialResult4ApproxCountDistinct) {
	if tar.skiFIDelegree > p.skiFIDelegree {
		p.skiFIDelegree = tar.skiFIDelegree
		p.rehash()
	}

	if !p.hasZero && tar.hasZero {
		p.hasZero = true
		p.size++
		p.shrinkIfNeed()
	}

	for i := uint32(0); i < tar.bufSize(); i++ {
		if tar.buf[i] != 0 && p.good(tar.buf[i]) {
			p.insertImpl(tar.buf[i])
			p.shrinkIfNeed()
		}
	}
}
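// The sketch below is illustrative only and not part of the original file: it feeds a few
// hypothetical values through the same hash-then-insert path used by approxCountDistinctOriginal
// and reads back the estimate. With tiny inputs, skiFIDelegree stays 0 and fixedSize returns the
// exact count of distinct 32-bit hashes.
func exampleApproxCountDistinct() uint64 {
	p := NewPartialResult4ApproxCountDistinct()
	for _, s := range []string{"a", "b", "a", "c"} {
		p.InsertHash64(farm.Hash64([]byte(s)))
	}
	return p.fixedSize() // 3 for this input, barring a 32-bit hash collision
}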
func (p *partialResult4ApproxCountDistinct) Serialize() []byte {
	var buf [4]byte
	res := make([]byte, 0, 1+binary.MaxVarintLen64+p.size*4)

	res = append(res, p.skiFIDelegree)
	res = codec.EncodeUvarint(res, uint64(p.size))

	if p.hasZero {
		binary.LittleEndian.PutUint32(buf[:], 0)
		res = append(res, buf[:]...)
	}

	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 {
			binary.LittleEndian.PutUint32(buf[:], uint32(p.buf[i]))
			res = append(res, buf[:]...)
		}
	}
	return res
}

func (e *baseApproxCountDistinct) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4ApproxCountDistinct)(pr)
	chk.AppendInt64(e.ordinal, int64(p.fixedSize()))
	return nil
}

func (e *baseApproxCountDistinct) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return (PartialResult)(NewPartialResult4ApproxCountDistinct()), DefPartialResult4ApproxCountDistinctSize
}

func (e *baseApproxCountDistinct) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	p.reset()
}

func (e *baseApproxCountDistinct) MergePartialResult(sctx stochastikctx.Context, src, dst PartialResult) (memDelta int64, err error) {
	p1, p2 := (*partialResult4ApproxCountDistinct)(src), (*partialResult4ApproxCountDistinct)(dst)
	p2.merge(p1)
	return 0, nil
}

type approxCountDistinctOriginal struct {
	baseApproxCountDistinct
}

func (e *approxCountDistinctOriginal) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	encodedBytes := make([]byte, 0)
	// Decimal struct is the biggest type we will use.
	buf := make([]byte, types.MyDecimalStructSize)

	for _, event := range rowsInGroup {
		var err error
		var hasNull, isNull bool
		encodedBytes = encodedBytes[:0]

		for i := 0; i < len(e.args) && !hasNull; i++ {
			encodedBytes, isNull, err = evalAndEncode(sctx, e.args[i], event, buf, encodedBytes)
			if err != nil {
				return memDelta, err
			}
			if isNull {
				hasNull = true
				break
			}
		}
		if hasNull {
			continue
		}
		oldMemUsage := p.MemUsage()
		x := farm.Hash64(encodedBytes)
		p.InsertHash64(x)
		newMemUsage := p.MemUsage()
		memDelta += newMemUsage - oldMemUsage
	}

	return memDelta, nil
}

type approxCountDistinctPartial1 struct {
	approxCountDistinctOriginal
}

func (e *approxCountDistinctPartial1) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4ApproxCountDistinct)(pr)
	chk.AppendBytes(e.ordinal, p.Serialize())
	return nil
}

type approxCountDistinctPartial2 struct {
	approxCountDistinctPartial1
}

func (e *approxCountDistinctPartial2) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalString(sctx, event)
		if err != nil {
			return memDelta, err
		}

		if isNull {
			continue
		}

		oldMemUsage := p.MemUsage()
		err = p.readAndMerge(replog.Slice(input))
		if err != nil {
			return memDelta, err
		}
		newMemUsage := p.MemUsage()
		memDelta += newMemUsage - oldMemUsage
	}
	return memDelta, nil
}

type approxCountDistinctFinal struct {
	approxCountDistinctPartial2
}

func (e *approxCountDistinctFinal) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	return e.baseApproxCountDistinct.AppendFinalResult2Chunk(sctx, pr, chk)
}
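// The sketch below is illustrative only and not part of the original file: it round-trips one
// sketch through Serialize and readAndMerge, the same pair used between the partial stage
// (approxCountDistinctPartial1 emits Serialize() via AppendBytes) and the later stage
// (approxCountDistinctPartial2 parses it back with readAndMerge). The function name and the
// inserted values are hypothetical.
func exampleSerializeAndMerge() (uint64, error) {
	partial := NewPartialResult4ApproxCountDistinct()
	partial.InsertHash64(farm.Hash64([]byte("x")))

	final := NewPartialResult4ApproxCountDistinct()
	final.InsertHash64(farm.Hash64([]byte("y")))

	// Merge the serialized partial sketch into the final one.
	if err := final.readAndMerge(partial.Serialize()); err != nil {
		return 0, err
	}
	return final.fixedSize(), nil // expected 2, assuming no 32-bit hash collision
}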