github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/common/hll.go (about) 1 // Copyright (c) 2017-2018 Uber Technologies, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package common 16 17 import ( 18 "bytes" 19 "github.com/uber/aresdb/utils" 20 "strings" 21 22 "github.com/pkg/errors" 23 memCom "github.com/uber/aresdb/memstore/common" 24 "io" 25 "math" 26 "sort" 27 "unsafe" 28 ) 29 30 const ( 31 // OldHLLDataHeader is the old magic header for migration 32 OldHLLDataHeader uint32 = 0xACED0101 33 // HLLDataHeader is the magic header written into serialized format of hyperloglog query result. 34 HLLDataHeader uint32 = 0xACED0102 35 // EnumDelimiter is the delimiter to delimit enum cases. 36 EnumDelimiter = "\u0000\n" 37 // DenseDataLength is the length of hll dense data in bytes. 38 DenseDataLength = 1 << 14 // 16kb 39 // DenseThreshold is the thresold to convert sparse value to dense value. 40 DenseThreshold = DenseDataLength / 4 41 ) 42 43 // HLLData stores fields for serialize and deserialize an hyperloglog query result when client sets Content-Accept 44 // header to be application/hll. 45 // The serialized buffer of a hll data is in following format: 46 // [uint32] magic_number [uint32] padding 47 // 48 // -----------query result 0------------------- 49 // <header> 50 // [uint32] query result 0 size [uint8] error or result [3 bytes padding] 51 // [uint8] num_enum_columns [uint8] bytes per dim ... [padding for 8 bytes] 52 // [uint32] result_size [uint32] raw_dim_values_vector_length 53 // [uint8] dim_index_0... [uint8] dim_index_n [padding for 8 bytes] 54 // [uint32] data_type_0...[uint32] data_type_n [padding for 8 bytes] 55 // 56 // <enum cases 0> 57 // [uint32_t] number of bytes of enum cases [uint16] column_index [2 bytes: padding] 58 // <enum values 0> delimited by "\u0000\n" [padding for 8 bytes] 59 // <end of header> 60 // <raw dim values vector> 61 // ... 62 // [padding for 8 byte alignment] 63 // 64 // <raw hll dense vector> 65 // ... 66 // ------------error 1---------- 67 // [uint32] query result 1 size [uint8] error or result [3 bytes padding] 68 // ... 69 type HLLData struct { 70 NumDimsPerDimWidth DimCountsPerDimWidth 71 ResultSize uint32 72 PaddedRawDimValuesVectorLength uint32 73 PaddedHLLVectorLength int64 74 75 DimIndexes []int 76 DataTypes []memCom.DataType 77 // map from column id => enum cases. It will 78 // only include columns used in dimensions. 79 EnumDicts map[int][]string 80 } 81 82 // CalculateSizes returns the header size and total size of used by this hll data. 83 func (data *HLLData) CalculateSizes() (uint32, int64) { 84 // num enum columns (1 byte) 85 var headerSize = 1 86 // Dims per width (1 byte * numDims) 87 headerSize += len(data.NumDimsPerDimWidth) 88 // padding for 8 bytes 89 headerSize = utils.AlignOffset(headerSize, 8) 90 // result size (4 bytes) + raw_dim_values_vector_length (4 bytes) 91 headerSize += 8 92 93 // Dim indexes. 94 headerSize += (len(data.DimIndexes) + 7) / 8 * 8 95 96 // Data types. 97 headerSize += (len(data.DataTypes)*4 + 7) / 8 * 8 98 99 // Enum cases. 100 for _, enumCases := range data.EnumDicts { 101 // number of bytes of enum cases + column index + padding = 8 bytes. 102 headerSize += int(8 + CalculateEnumCasesBytes(enumCases)) 103 } 104 105 totalSize := int64(headerSize) 106 107 // Dim values. 108 totalSize += int64(data.PaddedRawDimValuesVectorLength) 109 110 // Counts. 111 totalSize += int64(2*data.ResultSize+7) / 8 * 8 112 113 // HLL dense vector. 114 totalSize += data.PaddedHLLVectorLength 115 116 return uint32(headerSize), totalSize 117 } 118 119 // CalculateEnumCasesBytes calculates how many bytes the enum case values will occupy including 8 bytes alignment. 120 func CalculateEnumCasesBytes(enumCases []string) uint32 { 121 var size uint32 122 123 for _, enumCase := range enumCases { 124 size += uint32(len(enumCase)) 125 } 126 127 // enum cases delimiters. 128 size += uint32(len(enumCases)) * 2 129 130 // align by 8 bytes. 131 return (size + 7) / 8 * 8 132 } 133 134 // HLLRegister is the register used in the sparse representation. 135 type HLLRegister struct { 136 Index uint16 `json:"index"` 137 Rho byte `json:"rho"` 138 } 139 140 // HLL stores only the dense data for now. 141 type HLL struct { 142 SparseData []HLLRegister // Unsorted registers. 143 DenseData []byte // Rho by register index. 144 NonZeroRegisters uint16 145 } 146 147 // Merge merges (using max(rho)) the other HLL (sparse or dense) into this one (will be converted to dense). 148 func (hll *HLL) Merge(other HLL) { 149 hll.ConvertToDense() 150 for _, register := range other.SparseData { 151 oldRho := hll.DenseData[register.Index] 152 if oldRho == 0 { 153 hll.NonZeroRegisters++ 154 } 155 if oldRho < register.Rho { 156 hll.DenseData[register.Index] = register.Rho 157 } 158 } 159 for index, rho := range other.DenseData { 160 oldRho := hll.DenseData[index] 161 if oldRho == 0 && rho != 0 { 162 hll.NonZeroRegisters++ 163 } 164 if oldRho < rho { 165 hll.DenseData[index] = rho 166 } 167 } 168 } 169 170 // ConvertToDense converts the HLL to dense format. 171 func (hll *HLL) ConvertToDense() { 172 if len(hll.DenseData) != 0 { 173 return 174 } 175 176 hll.DenseData = make([]byte, 1<<hllP) 177 for _, register := range hll.SparseData { 178 hll.DenseData[register.Index] = register.Rho 179 } 180 hll.SparseData = nil 181 } 182 183 // ConvertToSparse try converting the hll to sparse format if it turns out to be cheaper. 184 func (hll *HLL) ConvertToSparse() bool { 185 if hll.NonZeroRegisters*4 >= 1<<hllP { 186 return false 187 } 188 if hll.SparseData != nil { 189 return true 190 } 191 hll.SparseData = make([]HLLRegister, 0, hll.NonZeroRegisters) 192 for index, rho := range hll.DenseData { 193 if rho != 0 { 194 hll.SparseData = append(hll.SparseData, HLLRegister{uint16(index), rho}) 195 } 196 } 197 hll.DenseData = nil 198 return true 199 } 200 201 // Set sets rho for the specified register index. Caller must ensure that each register is set no more than once. 202 func (hll *HLL) Set(index uint16, rho byte) { 203 hll.NonZeroRegisters++ 204 205 if len(hll.DenseData) != 0 { 206 hll.DenseData[index] = rho 207 return 208 } 209 210 hll.SparseData = append(hll.SparseData, HLLRegister{index, rho}) 211 212 if hll.NonZeroRegisters*4 >= 1<<hllP { 213 hll.ConvertToDense() 214 } 215 } 216 217 func parseOldTimeseriesHLLResult(buffer []byte) (AQLQueryResult, error) { 218 // empty result buffer 219 if len(buffer) == 0 { 220 return AQLQueryResult{}, nil 221 } 222 223 reader := utils.NewStreamDataReader(bytes.NewBuffer(buffer)) 224 225 numFourBytesDims, err := reader.ReadUint8() 226 if err != nil { 227 return nil, err 228 } 229 230 numTwoBytesDims, err := reader.ReadUint8() 231 if err != nil { 232 return nil, err 233 } 234 235 numOneBytesDims, err := reader.ReadUint8() 236 if err != nil { 237 return nil, err 238 } 239 240 numEnumColumns, err := reader.ReadUint8() 241 if err != nil { 242 return nil, err 243 } 244 245 totalDims := int(numFourBytesDims + numTwoBytesDims + numOneBytesDims) 246 247 numDimsPerDimWidth := DimCountsPerDimWidth{0, 0, numFourBytesDims, numTwoBytesDims, numOneBytesDims} 248 249 resultSize, err := reader.ReadUint32() 250 if err != nil { 251 return nil, err 252 } 253 254 paddedRawDimValuesVectorLength, err := reader.ReadUint32() 255 if err != nil { 256 return nil, err 257 } 258 259 if err := reader.SkipBytes(4); err != nil { 260 return nil, err 261 } 262 263 dimIndexes := make([]uint8, totalDims) 264 265 for i := range dimIndexes { 266 dimIndexes[i], err = reader.ReadUint8() 267 if err != nil { 268 return nil, err 269 } 270 } 271 272 if err = reader.ReadPadding(int(totalDims), 8); err != nil { 273 return nil, err 274 } 275 276 dataTypes := make([]memCom.DataType, totalDims) 277 278 for i := range dataTypes { 279 rawDataType, err := reader.ReadUint32() 280 if err != nil { 281 return nil, err 282 } 283 284 dataType, err := memCom.NewDataType(rawDataType) 285 if err != nil { 286 return nil, err 287 } 288 289 dataTypes[i] = dataType 290 } 291 292 if err = reader.ReadPadding(int(totalDims)*4, 8); err != nil { 293 return nil, err 294 } 295 296 enumDicts := make(map[int][]string) 297 var i uint8 298 for ; i < numEnumColumns; i++ { 299 enumCasesBytes, err := reader.ReadUint32() 300 if err != nil { 301 return nil, err 302 } 303 304 columnID, err := reader.ReadUint16() 305 if err != nil { 306 return nil, err 307 } 308 reader.SkipBytes(2) 309 rawEnumCases := make([]byte, enumCasesBytes) 310 if err = reader.Read(rawEnumCases); err != nil { 311 return nil, err 312 } 313 314 enumCases := strings.Split(string(rawEnumCases), EnumDelimiter) 315 316 // remove last empty element. 317 enumCases = enumCases[:len(enumCases)-1] 318 enumDicts[int(columnID)] = enumCases 319 } 320 321 headerSize := reader.GetBytesRead() 322 323 result := make(AQLQueryResult) 324 325 paddedCountLength := uint32(2*resultSize+7) / 8 * 8 326 327 dimValuesVector := unsafe.Pointer(&buffer[headerSize]) 328 329 countVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength]) 330 331 hllVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength+paddedCountLength]) 332 333 dimOffsets := make([][2]int, totalDims) 334 dimValues := make([]*string, totalDims) 335 336 for i := 0; i < totalDims; i++ { 337 dimIndex := int(dimIndexes[i]) 338 valueOffset, nullOffset := GetDimensionStartOffsets(numDimsPerDimWidth, dimIndex, int(resultSize)) 339 dimOffsets[i] = [2]int{valueOffset, nullOffset} 340 } 341 342 var currentOffset int64 343 344 for i := 0; i < int(resultSize); i++ { 345 for dimIndex := 0; dimIndex < totalDims; dimIndex++ { 346 offsets := dimOffsets[dimIndex] 347 valueOffset, nullOffset := offsets[0], offsets[1] 348 valuePtr, nullPtr := memAccess(dimValuesVector, valueOffset), memAccess(dimValuesVector, nullOffset) 349 dimValues[dimIndex] = ReadDimension(valuePtr, nullPtr, i, dataTypes[dimIndex], enumDicts[dimIndex], nil, nil) 350 } 351 352 count := *(*uint16)(memAccess(countVector, int(2*i))) 353 hll := readHLL(hllVector, count, ¤tOffset) 354 result.SetHLL(dimValues, hll) 355 } 356 357 return result, nil 358 } 359 360 func parseTimeseriesHLLResult(buffer []byte) (AQLQueryResult, error) { 361 // empty result buffer 362 if len(buffer) == 0 { 363 return AQLQueryResult{}, nil 364 } 365 366 reader := utils.NewStreamDataReader(bytes.NewBuffer(buffer)) 367 numEnumColumns, err := reader.ReadUint8() 368 if err != nil { 369 return nil, err 370 } 371 372 var numDimsPerDimWidth DimCountsPerDimWidth 373 err = reader.Read([]byte(numDimsPerDimWidth[:])) 374 if err != nil { 375 return AQLQueryResult{}, nil 376 } 377 378 totalDims := 0 379 for _, dimCount := range numDimsPerDimWidth { 380 totalDims += int(dimCount) 381 } 382 383 err = reader.ReadPadding(int(reader.GetBytesRead()), 8) 384 if err != nil { 385 return nil, err 386 } 387 388 resultSize, err := reader.ReadUint32() 389 if err != nil { 390 return nil, err 391 } 392 393 paddedRawDimValuesVectorLength, err := reader.ReadUint32() 394 if err != nil { 395 return nil, err 396 } 397 398 dimIndexes := make([]uint8, totalDims) 399 for i := range dimIndexes { 400 dimIndexes[i], err = reader.ReadUint8() 401 if err != nil { 402 return nil, err 403 } 404 } 405 406 if err = reader.ReadPadding(int(totalDims), 8); err != nil { 407 return nil, err 408 } 409 410 dataTypes := make([]memCom.DataType, totalDims) 411 412 for i := range dataTypes { 413 rawDataType, err := reader.ReadUint32() 414 if err != nil { 415 return nil, err 416 } 417 418 dataType, err := memCom.NewDataType(rawDataType) 419 if err != nil { 420 return nil, err 421 } 422 423 dataTypes[i] = dataType 424 } 425 426 if err = reader.ReadPadding(int(totalDims)*4, 8); err != nil { 427 return nil, err 428 } 429 430 enumDicts := make(map[int][]string) 431 var i uint8 432 for ; i < numEnumColumns; i++ { 433 enumCasesBytes, err := reader.ReadUint32() 434 if err != nil { 435 return nil, err 436 } 437 438 columnID, err := reader.ReadUint16() 439 if err != nil { 440 return nil, err 441 } 442 reader.SkipBytes(2) 443 rawEnumCases := make([]byte, enumCasesBytes) 444 if err = reader.Read(rawEnumCases); err != nil { 445 return nil, err 446 } 447 448 enumCases := strings.Split(string(rawEnumCases), EnumDelimiter) 449 450 // remove last empty element. 451 enumCases = enumCases[:len(enumCases)-1] 452 enumDicts[int(columnID)] = enumCases 453 } 454 455 headerSize := reader.GetBytesRead() 456 457 result := make(AQLQueryResult) 458 459 paddedCountLength := uint32(2*resultSize+7) / 8 * 8 460 461 dimValuesVector := unsafe.Pointer(&buffer[headerSize]) 462 463 countVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength]) 464 465 hllVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength+paddedCountLength]) 466 467 dimOffsets := make([][2]int, totalDims) 468 dimValues := make([]*string, totalDims) 469 470 for i := 0; i < totalDims; i++ { 471 dimIndex := int(dimIndexes[i]) 472 valueOffset, nullOffset := GetDimensionStartOffsets(numDimsPerDimWidth, dimIndex, int(resultSize)) 473 dimOffsets[i] = [2]int{valueOffset, nullOffset} 474 } 475 476 var currentOffset int64 477 478 for i := 0; i < int(resultSize); i++ { 479 for dimIndex := 0; dimIndex < totalDims; dimIndex++ { 480 offsets := dimOffsets[dimIndex] 481 valueOffset, nullOffset := offsets[0], offsets[1] 482 valuePtr, nullPtr := memAccess(dimValuesVector, valueOffset), memAccess(dimValuesVector, nullOffset) 483 dimValues[dimIndex] = ReadDimension(valuePtr, nullPtr, i, dataTypes[dimIndex], enumDicts[dimIndex], nil, nil) 484 } 485 486 count := *(*uint16)(memAccess(countVector, int(2*i))) 487 hll := readHLL(hllVector, count, ¤tOffset) 488 result.SetHLL(dimValues, hll) 489 } 490 491 return result, nil 492 } 493 494 // ComputeHLLResult computes hll result 495 func ComputeHLLResult(result AQLQueryResult) AQLQueryResult { 496 return computeHLLResultRecursive(result).(AQLQueryResult) 497 } 498 499 // computeHLLResultRecursive computes hll value 500 func computeHLLResultRecursive(result interface{}) interface{} { 501 switch r := result.(type) { 502 case AQLQueryResult: 503 for k, v := range r { 504 r[k] = computeHLLResultRecursive(v) 505 } 506 return r 507 case map[string]interface{}: 508 for k, v := range r { 509 r[k] = computeHLLResultRecursive(v) 510 } 511 return r 512 case HLL: 513 return r.Compute() 514 default: 515 // return original for all other types 516 return r 517 } 518 } 519 520 // NewTimeSeriesHLLResult creates a new NewTimeSeriesHLLResult and deserialize the buffer into the result. 521 func NewTimeSeriesHLLResult(buffer []byte, magicHeader uint32) (AQLQueryResult, error) { 522 switch magicHeader { 523 case OldHLLDataHeader: 524 return parseOldTimeseriesHLLResult(buffer) 525 case HLLDataHeader: 526 return parseTimeseriesHLLResult(buffer) 527 default: 528 // should not happen 529 return nil, utils.StackError(nil, "magic header version unsupported: %d", magicHeader) 530 } 531 } 532 533 // memAccess access memory location with starting pointer and an offset. 534 func memAccess(p unsafe.Pointer, offset int) unsafe.Pointer { 535 return unsafe.Pointer(uintptr(p) + uintptr(offset)) 536 } 537 538 // readHLL reads the HLL struct from the raw buffer and returns next offset 539 func readHLL(hllVector unsafe.Pointer, count uint16, currentOffset *int64) HLL { 540 var sparseData []HLLRegister 541 var nonZeroRegisters uint16 542 var denseData []byte 543 if count < DenseThreshold { 544 var i uint16 545 sparseData = make([]HLLRegister, 0, count) 546 for ; i < count; i++ { 547 data := *(*uint32)(memAccess(hllVector, int(*currentOffset))) 548 index := uint16(data) // Big-endian from UNHEX... 549 rho := byte((data >> 16) & 0xFF) 550 sparseData = append(sparseData, HLLRegister{ 551 Index: index, 552 Rho: rho, 553 }) 554 *currentOffset += 4 555 } 556 nonZeroRegisters = count 557 } else { 558 denseData = (*(*[DenseDataLength]byte)((memAccess(hllVector, int(*currentOffset)))))[:] 559 *currentOffset += DenseDataLength 560 for _, b := range denseData { 561 if b != 0 { 562 nonZeroRegisters++ 563 } 564 } 565 } 566 567 return HLL{ 568 DenseData: denseData, 569 SparseData: sparseData, 570 NonZeroRegisters: nonZeroRegisters, 571 } 572 } 573 574 // ParseHLLQueryResults will parse the response body into a slice of query results and a slice of errors. 575 func ParseHLLQueryResults(data []byte) (queryResults []AQLQueryResult, queryErrors []error, err error) { 576 reader := utils.NewStreamDataReader(bytes.NewBuffer(data)) 577 578 var magicHeader uint32 579 magicHeader, err = reader.ReadUint32() 580 if err != nil { 581 return 582 } 583 584 if magicHeader != OldHLLDataHeader && magicHeader != HLLDataHeader { 585 err = utils.StackError(nil, "header %x does not match HLLDataHeader %x or %x", 586 magicHeader, OldHLLDataHeader, HLLDataHeader) 587 return 588 } 589 590 reader.SkipBytes(4) 591 592 var size uint32 593 var isErr uint8 594 595 for size, err = reader.ReadUint32(); err == nil; size, err = reader.ReadUint32() { 596 if isErr, err = reader.ReadUint8(); err != nil { 597 return 598 } 599 600 reader.SkipBytes(3) 601 602 bs := make([]byte, size) 603 err = reader.Read(bs) 604 if err != nil { 605 break 606 } 607 608 if isErr != 0 { 609 queryErrors = append(queryErrors, errors.New(string(bs))) 610 queryResults = append(queryResults, nil) 611 } else { 612 var res AQLQueryResult 613 if res, err = NewTimeSeriesHLLResult(bs, magicHeader); err != nil { 614 return 615 } 616 queryResults = append(queryResults, res) 617 queryErrors = append(queryErrors, nil) 618 } 619 } 620 621 if err == io.EOF { 622 err = nil 623 } 624 return 625 } 626 627 type hllBiasByDistance struct { 628 distance, bias float64 629 } 630 631 func getEstimateBias(estimate float64) float64 { 632 i := sort.Search(len(hllRawEstimates), func(i int) bool { return estimate < hllRawEstimates[i] }) 633 634 // Find nearest k neighbors. 635 k := 6 636 startIdx := i - 1 - k 637 endIdx := i + k 638 if startIdx < 0 { 639 startIdx = 0 640 } 641 if endIdx > len(hllRawEstimates) { 642 endIdx = len(hllRawEstimates) 643 } 644 biases := make(hllBiasesByDistances, endIdx-startIdx) 645 for i := startIdx; i < endIdx; i++ { 646 biases[i-startIdx].distance = (hllRawEstimates[i] - estimate) * (hllRawEstimates[i] - estimate) 647 biases[i-startIdx].bias = hllBiases[i] 648 } 649 sort.Sort(biases) 650 651 biasSum := 0.0 652 for i := 0; i < k; i++ { 653 biasSum += biases[i].bias 654 } 655 656 return biasSum / float64(k) 657 } 658 659 // Decode decodes the HLL from cache cache. 660 // Interprets as dense or sparse format based on len(data). 661 func (hll *HLL) Decode(data []byte) { 662 if len(data) == 1<<hllP { 663 hll.DenseData = data 664 hll.SparseData = nil 665 hll.NonZeroRegisters = 0 666 for _, rho := range data { 667 if rho != 0 { 668 hll.NonZeroRegisters++ 669 } 670 } 671 } else { 672 hll.DenseData = nil 673 hll.SparseData = make([]HLLRegister, len(data)/3) 674 hll.NonZeroRegisters = uint16(len(data) / 3) 675 for i := 0; i < len(data)/3; i++ { 676 var register HLLRegister 677 register.Index = uint16(data[i*3]) | (uint16(data[i*3+1]) << 8) 678 register.Rho = data[i*3+2] 679 hll.SparseData[i] = register 680 } 681 } 682 } 683 684 // Encode encodes the HLL for cache storage. 685 // Dense format will have a length of 1<<hllP. 686 // Sparse format will have a smaller length 687 func (hll *HLL) Encode() []byte { 688 if len(hll.DenseData) != 0 { 689 return hll.DenseData 690 } 691 data := make([]byte, 3*len(hll.SparseData)) 692 for i, register := range hll.SparseData { 693 data[i*3] = byte(register.Index & 0xff) 694 data[i*3+1] = byte(register.Index >> 8) 695 data[i*3+2] = register.Rho 696 } 697 return data 698 } 699 700 // Compute computes the result of the HLL. 701 func (hll *HLL) Compute() float64 { 702 nonZeroRegisters := float64(hll.NonZeroRegisters) 703 m := float64(uint64(1) << hllP) 704 705 // Sum of reciproclas of rhos 706 var sumOfReciprocals float64 707 for _, register := range hll.SparseData { 708 sumOfReciprocals += 1.0 / float64(uint64(1)<<register.Rho) 709 } 710 if len(hll.DenseData) == 0 { 711 // Add missing rho reciprocals for sparse form. 712 sumOfReciprocals += m - nonZeroRegisters 713 } 714 for _, rho := range hll.DenseData { 715 sumOfReciprocals += 1.0 / float64(uint64(1)<<rho) 716 } 717 718 // Initial estimation. 719 alpha := 0.7213 / (1 + 1.079/m) 720 estimate := alpha * m * m / sumOfReciprocals 721 722 // Bias correction. 723 if estimate <= 5.0*m { 724 estimate -= getEstimateBias(estimate) 725 } 726 727 estimateH := estimate 728 729 if nonZeroRegisters < m { 730 // Linear counting 731 estimateH = m * math.Log(m/(m-nonZeroRegisters)) 732 } 733 734 if estimateH <= hllThreshold { 735 estimate = estimateH 736 } 737 738 // Round 739 return float64(uint64(estimate)) 740 } 741 742 type hllBiasesByDistances []hllBiasByDistance 743 744 func (b hllBiasesByDistances) Len() int { return len(b) } 745 func (b hllBiasesByDistances) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 746 func (b hllBiasesByDistances) Less(i, j int) bool { 747 return b[i].distance < b[j].distance 748 } 749 750 // threshold and bias data taken from google's bias correction data set: 751 // https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/view?fullscreen# 752 var hllP byte = 14 753 754 var hllThreshold = 15500.0 755 756 // precision 14 757 var hllRawEstimates = []float64{ 758 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 759 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 760 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 761 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 762 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 763 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, 764 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132, 765 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008, 766 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752, 767 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849, 768 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 769 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 770 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, 771 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724, 772 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, 49613.7796, 773 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319, 774 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748, 775 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 776 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 777 62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, 778 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, 68821.22, 779 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006, 780 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904, 781 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962, 782 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884} 783 784 // precision 14 785 var hllBiases = []float64{ 786 11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 787 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 788 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 789 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 790 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, 791 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828, 792 3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494, 793 2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498, 794 2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849, 795 1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, 796 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436, 797 990.901400000002, 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, 798 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858, 799 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999, 800 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002, 801 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997, 802 357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999, 803 279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001, 804 218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268, 805 164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 806 117.614399999999, 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 807 84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989, 808 50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 809 29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 810 13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 811 6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, 812 -3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, 813 -24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, 814 -32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, 815 -54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, 816 -49.9551999999967, -42.6116000000038}