github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/common/hll.go (about)

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package common
    16  
    17  import (
    18  	"bytes"
    19  	"github.com/uber/aresdb/utils"
    20  	"strings"
    21  
    22  	"github.com/pkg/errors"
    23  	memCom "github.com/uber/aresdb/memstore/common"
    24  	"io"
    25  	"math"
    26  	"sort"
    27  	"unsafe"
    28  )
    29  
const (
	// OldHLLDataHeader is the old magic header for migration
	OldHLLDataHeader uint32 = 0xACED0101
	// HLLDataHeader is the magic header written into serialized format of hyperloglog query result.
	HLLDataHeader uint32 = 0xACED0102
	// EnumDelimiter is the two-byte sequence that terminates each enum case in the serialized header.
	EnumDelimiter = "\u0000\n"
	// DenseDataLength is the length of hll dense data in bytes.
	DenseDataLength = 1 << 14 // 16kb
	// DenseThreshold is the threshold to convert sparse value to dense value:
	// readHLL treats a per-row register count below it as sparse, anything else as dense.
	DenseThreshold = DenseDataLength / 4
)
    42  
// HLLData stores fields for serializing and deserializing a hyperloglog query result when client sets Content-Accept
// header to be application/hll.
// The serialized buffer of a hll data is in following format:
//	 [uint32] magic_number [uint32] padding
//
//	-----------query result 0-------------------
//	 <header>
//	 [uint32] query result 0 size [uint8] error or result [3 bytes padding]
//	 [uint8] num_enum_columns [uint8] bytes per dim ... [padding for 8 bytes]
//	 [uint32] result_size [uint32] raw_dim_values_vector_length
//	 [uint8] dim_index_0... [uint8] dim_index_n [padding for 8 bytes]
//	 [uint32] data_type_0...[uint32] data_type_n [padding for 8 bytes]
//
//	 <enum cases 0>
//	 [uint32_t] number of bytes of enum cases [uint16] column_index [2 bytes: padding]
//	 <enum values 0> delimited by "\u0000\n" [padding for 8 bytes]
//	 <end of header>
//	 <raw dim values vector>
//	 ...
//	 [padding for 8 byte alignment]
//
//	 <raw hll dense vector>
//	 ...
//	------------error 1----------
//	 [uint32] query result 1 size  [uint8] error or result [3 bytes padding]
//	...
type HLLData struct {
	// NumDimsPerDimWidth holds the number of dimensions per dimension width;
	// CalculateSizes reserves one header byte per entry.
	NumDimsPerDimWidth DimCountsPerDimWidth
	// ResultSize is the number of result rows; CalculateSizes reserves 2 bytes per row for the counts.
	ResultSize uint32
	// PaddedRawDimValuesVectorLength is the byte length of the raw dim values vector
	// (presumably already padded by the producer, as the name suggests).
	PaddedRawDimValuesVectorLength uint32
	// PaddedHLLVectorLength is the byte length of the hll vector
	// (presumably already padded by the producer, as the name suggests).
	PaddedHLLVectorLength int64

	// DimIndexes takes one byte per entry in the serialized header.
	DimIndexes []int
	// DataTypes takes four bytes per entry in the serialized header.
	DataTypes []memCom.DataType
	// map from column id => enum cases. It will
	// only include columns used in dimensions.
	EnumDicts map[int][]string
}
    81  
    82  // CalculateSizes returns the header size and total size of used by this hll data.
    83  func (data *HLLData) CalculateSizes() (uint32, int64) {
    84  	// num enum columns (1 byte)
    85  	var headerSize = 1
    86  	// Dims per width (1 byte * numDims)
    87  	headerSize += len(data.NumDimsPerDimWidth)
    88  	// padding for 8 bytes
    89  	headerSize = utils.AlignOffset(headerSize, 8)
    90  	// result size (4 bytes) + raw_dim_values_vector_length (4 bytes)
    91  	headerSize += 8
    92  
    93  	// Dim indexes.
    94  	headerSize += (len(data.DimIndexes) + 7) / 8 * 8
    95  
    96  	// Data types.
    97  	headerSize += (len(data.DataTypes)*4 + 7) / 8 * 8
    98  
    99  	// Enum cases.
   100  	for _, enumCases := range data.EnumDicts {
   101  		// number of bytes of enum cases + column index + padding = 8 bytes.
   102  		headerSize += int(8 + CalculateEnumCasesBytes(enumCases))
   103  	}
   104  
   105  	totalSize := int64(headerSize)
   106  
   107  	// Dim values.
   108  	totalSize += int64(data.PaddedRawDimValuesVectorLength)
   109  
   110  	// Counts.
   111  	totalSize += int64(2*data.ResultSize+7) / 8 * 8
   112  
   113  	// HLL dense vector.
   114  	totalSize += data.PaddedHLLVectorLength
   115  
   116  	return uint32(headerSize), totalSize
   117  }
   118  
   119  // CalculateEnumCasesBytes calculates how many bytes the enum case values will occupy including 8 bytes alignment.
   120  func CalculateEnumCasesBytes(enumCases []string) uint32 {
   121  	var size uint32
   122  
   123  	for _, enumCase := range enumCases {
   124  		size += uint32(len(enumCase))
   125  	}
   126  
   127  	// enum cases delimiters.
   128  	size += uint32(len(enumCases)) * 2
   129  
   130  	// align by 8 bytes.
   131  	return (size + 7) / 8 * 8
   132  }
   133  
// HLLRegister is the register used in the sparse representation:
// a register index together with the rho value recorded for it.
type HLLRegister struct {
	Index uint16 `json:"index"` // Register index.
	Rho   byte   `json:"rho"`   // Rho value recorded for the register.
}
   139  
// HLL holds a hyperloglog sketch in either sparse form (SparseData) or dense
// form (DenseData). The methods in this file treat a non-empty DenseData as
// the dense form and otherwise use SparseData.
type HLL struct {
	SparseData       []HLLRegister // Unsorted registers.
	DenseData        []byte        // Rho by register index.
	NonZeroRegisters uint16        // Number of registers with a non-zero rho.
}
   146  
   147  // Merge merges (using max(rho)) the other HLL (sparse or dense) into this one (will be converted to dense).
   148  func (hll *HLL) Merge(other HLL) {
   149  	hll.ConvertToDense()
   150  	for _, register := range other.SparseData {
   151  		oldRho := hll.DenseData[register.Index]
   152  		if oldRho == 0 {
   153  			hll.NonZeroRegisters++
   154  		}
   155  		if oldRho < register.Rho {
   156  			hll.DenseData[register.Index] = register.Rho
   157  		}
   158  	}
   159  	for index, rho := range other.DenseData {
   160  		oldRho := hll.DenseData[index]
   161  		if oldRho == 0 && rho != 0 {
   162  			hll.NonZeroRegisters++
   163  		}
   164  		if oldRho < rho {
   165  			hll.DenseData[index] = rho
   166  		}
   167  	}
   168  }
   169  
   170  // ConvertToDense converts the HLL to dense format.
   171  func (hll *HLL) ConvertToDense() {
   172  	if len(hll.DenseData) != 0 {
   173  		return
   174  	}
   175  
   176  	hll.DenseData = make([]byte, 1<<hllP)
   177  	for _, register := range hll.SparseData {
   178  		hll.DenseData[register.Index] = register.Rho
   179  	}
   180  	hll.SparseData = nil
   181  }
   182  
   183  // ConvertToSparse try converting the hll to sparse format if it turns out to be cheaper.
   184  func (hll *HLL) ConvertToSparse() bool {
   185  	if hll.NonZeroRegisters*4 >= 1<<hllP {
   186  		return false
   187  	}
   188  	if hll.SparseData != nil {
   189  		return true
   190  	}
   191  	hll.SparseData = make([]HLLRegister, 0, hll.NonZeroRegisters)
   192  	for index, rho := range hll.DenseData {
   193  		if rho != 0 {
   194  			hll.SparseData = append(hll.SparseData, HLLRegister{uint16(index), rho})
   195  		}
   196  	}
   197  	hll.DenseData = nil
   198  	return true
   199  }
   200  
   201  // Set sets rho for the specified register index. Caller must ensure that each register is set no more than once.
   202  func (hll *HLL) Set(index uint16, rho byte) {
   203  	hll.NonZeroRegisters++
   204  
   205  	if len(hll.DenseData) != 0 {
   206  		hll.DenseData[index] = rho
   207  		return
   208  	}
   209  
   210  	hll.SparseData = append(hll.SparseData, HLLRegister{index, rho})
   211  
   212  	if hll.NonZeroRegisters*4 >= 1<<hllP {
   213  		hll.ConvertToDense()
   214  	}
   215  }
   216  
   217  func parseOldTimeseriesHLLResult(buffer []byte) (AQLQueryResult, error) {
   218  	// empty result buffer
   219  	if len(buffer) == 0 {
   220  		return AQLQueryResult{}, nil
   221  	}
   222  
   223  	reader := utils.NewStreamDataReader(bytes.NewBuffer(buffer))
   224  
   225  	numFourBytesDims, err := reader.ReadUint8()
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  
   230  	numTwoBytesDims, err := reader.ReadUint8()
   231  	if err != nil {
   232  		return nil, err
   233  	}
   234  
   235  	numOneBytesDims, err := reader.ReadUint8()
   236  	if err != nil {
   237  		return nil, err
   238  	}
   239  
   240  	numEnumColumns, err := reader.ReadUint8()
   241  	if err != nil {
   242  		return nil, err
   243  	}
   244  
   245  	totalDims := int(numFourBytesDims + numTwoBytesDims + numOneBytesDims)
   246  
   247  	numDimsPerDimWidth := DimCountsPerDimWidth{0, 0, numFourBytesDims, numTwoBytesDims, numOneBytesDims}
   248  
   249  	resultSize, err := reader.ReadUint32()
   250  	if err != nil {
   251  		return nil, err
   252  	}
   253  
   254  	paddedRawDimValuesVectorLength, err := reader.ReadUint32()
   255  	if err != nil {
   256  		return nil, err
   257  	}
   258  
   259  	if err := reader.SkipBytes(4); err != nil {
   260  		return nil, err
   261  	}
   262  
   263  	dimIndexes := make([]uint8, totalDims)
   264  
   265  	for i := range dimIndexes {
   266  		dimIndexes[i], err = reader.ReadUint8()
   267  		if err != nil {
   268  			return nil, err
   269  		}
   270  	}
   271  
   272  	if err = reader.ReadPadding(int(totalDims), 8); err != nil {
   273  		return nil, err
   274  	}
   275  
   276  	dataTypes := make([]memCom.DataType, totalDims)
   277  
   278  	for i := range dataTypes {
   279  		rawDataType, err := reader.ReadUint32()
   280  		if err != nil {
   281  			return nil, err
   282  		}
   283  
   284  		dataType, err := memCom.NewDataType(rawDataType)
   285  		if err != nil {
   286  			return nil, err
   287  		}
   288  
   289  		dataTypes[i] = dataType
   290  	}
   291  
   292  	if err = reader.ReadPadding(int(totalDims)*4, 8); err != nil {
   293  		return nil, err
   294  	}
   295  
   296  	enumDicts := make(map[int][]string)
   297  	var i uint8
   298  	for ; i < numEnumColumns; i++ {
   299  		enumCasesBytes, err := reader.ReadUint32()
   300  		if err != nil {
   301  			return nil, err
   302  		}
   303  
   304  		columnID, err := reader.ReadUint16()
   305  		if err != nil {
   306  			return nil, err
   307  		}
   308  		reader.SkipBytes(2)
   309  		rawEnumCases := make([]byte, enumCasesBytes)
   310  		if err = reader.Read(rawEnumCases); err != nil {
   311  			return nil, err
   312  		}
   313  
   314  		enumCases := strings.Split(string(rawEnumCases), EnumDelimiter)
   315  
   316  		// remove last empty element.
   317  		enumCases = enumCases[:len(enumCases)-1]
   318  		enumDicts[int(columnID)] = enumCases
   319  	}
   320  
   321  	headerSize := reader.GetBytesRead()
   322  
   323  	result := make(AQLQueryResult)
   324  
   325  	paddedCountLength := uint32(2*resultSize+7) / 8 * 8
   326  
   327  	dimValuesVector := unsafe.Pointer(&buffer[headerSize])
   328  
   329  	countVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength])
   330  
   331  	hllVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength+paddedCountLength])
   332  
   333  	dimOffsets := make([][2]int, totalDims)
   334  	dimValues := make([]*string, totalDims)
   335  
   336  	for i := 0; i < totalDims; i++ {
   337  		dimIndex := int(dimIndexes[i])
   338  		valueOffset, nullOffset := GetDimensionStartOffsets(numDimsPerDimWidth, dimIndex, int(resultSize))
   339  		dimOffsets[i] = [2]int{valueOffset, nullOffset}
   340  	}
   341  
   342  	var currentOffset int64
   343  
   344  	for i := 0; i < int(resultSize); i++ {
   345  		for dimIndex := 0; dimIndex < totalDims; dimIndex++ {
   346  			offsets := dimOffsets[dimIndex]
   347  			valueOffset, nullOffset := offsets[0], offsets[1]
   348  			valuePtr, nullPtr := memAccess(dimValuesVector, valueOffset), memAccess(dimValuesVector, nullOffset)
   349  			dimValues[dimIndex] = ReadDimension(valuePtr, nullPtr, i, dataTypes[dimIndex], enumDicts[dimIndex], nil, nil)
   350  		}
   351  
   352  		count := *(*uint16)(memAccess(countVector, int(2*i)))
   353  		hll := readHLL(hllVector, count, &currentOffset)
   354  		result.SetHLL(dimValues, hll)
   355  	}
   356  
   357  	return result, nil
   358  }
   359  
   360  func parseTimeseriesHLLResult(buffer []byte) (AQLQueryResult, error) {
   361  	// empty result buffer
   362  	if len(buffer) == 0 {
   363  		return AQLQueryResult{}, nil
   364  	}
   365  
   366  	reader := utils.NewStreamDataReader(bytes.NewBuffer(buffer))
   367  	numEnumColumns, err := reader.ReadUint8()
   368  	if err != nil {
   369  		return nil, err
   370  	}
   371  
   372  	var numDimsPerDimWidth DimCountsPerDimWidth
   373  	err = reader.Read([]byte(numDimsPerDimWidth[:]))
   374  	if err != nil {
   375  		return AQLQueryResult{}, nil
   376  	}
   377  
   378  	totalDims := 0
   379  	for _, dimCount := range numDimsPerDimWidth {
   380  		totalDims += int(dimCount)
   381  	}
   382  
   383  	err = reader.ReadPadding(int(reader.GetBytesRead()), 8)
   384  	if err != nil {
   385  		return nil, err
   386  	}
   387  
   388  	resultSize, err := reader.ReadUint32()
   389  	if err != nil {
   390  		return nil, err
   391  	}
   392  
   393  	paddedRawDimValuesVectorLength, err := reader.ReadUint32()
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  
   398  	dimIndexes := make([]uint8, totalDims)
   399  	for i := range dimIndexes {
   400  		dimIndexes[i], err = reader.ReadUint8()
   401  		if err != nil {
   402  			return nil, err
   403  		}
   404  	}
   405  
   406  	if err = reader.ReadPadding(int(totalDims), 8); err != nil {
   407  		return nil, err
   408  	}
   409  
   410  	dataTypes := make([]memCom.DataType, totalDims)
   411  
   412  	for i := range dataTypes {
   413  		rawDataType, err := reader.ReadUint32()
   414  		if err != nil {
   415  			return nil, err
   416  		}
   417  
   418  		dataType, err := memCom.NewDataType(rawDataType)
   419  		if err != nil {
   420  			return nil, err
   421  		}
   422  
   423  		dataTypes[i] = dataType
   424  	}
   425  
   426  	if err = reader.ReadPadding(int(totalDims)*4, 8); err != nil {
   427  		return nil, err
   428  	}
   429  
   430  	enumDicts := make(map[int][]string)
   431  	var i uint8
   432  	for ; i < numEnumColumns; i++ {
   433  		enumCasesBytes, err := reader.ReadUint32()
   434  		if err != nil {
   435  			return nil, err
   436  		}
   437  
   438  		columnID, err := reader.ReadUint16()
   439  		if err != nil {
   440  			return nil, err
   441  		}
   442  		reader.SkipBytes(2)
   443  		rawEnumCases := make([]byte, enumCasesBytes)
   444  		if err = reader.Read(rawEnumCases); err != nil {
   445  			return nil, err
   446  		}
   447  
   448  		enumCases := strings.Split(string(rawEnumCases), EnumDelimiter)
   449  
   450  		// remove last empty element.
   451  		enumCases = enumCases[:len(enumCases)-1]
   452  		enumDicts[int(columnID)] = enumCases
   453  	}
   454  
   455  	headerSize := reader.GetBytesRead()
   456  
   457  	result := make(AQLQueryResult)
   458  
   459  	paddedCountLength := uint32(2*resultSize+7) / 8 * 8
   460  
   461  	dimValuesVector := unsafe.Pointer(&buffer[headerSize])
   462  
   463  	countVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength])
   464  
   465  	hllVector := unsafe.Pointer(&buffer[headerSize+paddedRawDimValuesVectorLength+paddedCountLength])
   466  
   467  	dimOffsets := make([][2]int, totalDims)
   468  	dimValues := make([]*string, totalDims)
   469  
   470  	for i := 0; i < totalDims; i++ {
   471  		dimIndex := int(dimIndexes[i])
   472  		valueOffset, nullOffset := GetDimensionStartOffsets(numDimsPerDimWidth, dimIndex, int(resultSize))
   473  		dimOffsets[i] = [2]int{valueOffset, nullOffset}
   474  	}
   475  
   476  	var currentOffset int64
   477  
   478  	for i := 0; i < int(resultSize); i++ {
   479  		for dimIndex := 0; dimIndex < totalDims; dimIndex++ {
   480  			offsets := dimOffsets[dimIndex]
   481  			valueOffset, nullOffset := offsets[0], offsets[1]
   482  			valuePtr, nullPtr := memAccess(dimValuesVector, valueOffset), memAccess(dimValuesVector, nullOffset)
   483  			dimValues[dimIndex] = ReadDimension(valuePtr, nullPtr, i, dataTypes[dimIndex], enumDicts[dimIndex], nil, nil)
   484  		}
   485  
   486  		count := *(*uint16)(memAccess(countVector, int(2*i)))
   487  		hll := readHLL(hllVector, count, &currentOffset)
   488  		result.SetHLL(dimValues, hll)
   489  	}
   490  
   491  	return result, nil
   492  }
   493  
// ComputeHLLResult computes hll result: it runs computeHLLResultRecursive over
// result, which replaces every HLL value found in the (possibly nested) maps
// with the value returned by its Compute method, and returns the same map.
func ComputeHLLResult(result AQLQueryResult) AQLQueryResult {
	// computeHLLResultRecursive returns an AQLQueryResult for an AQLQueryResult input.
	return computeHLLResultRecursive(result).(AQLQueryResult)
}
   498  
   499  // computeHLLResultRecursive computes hll value
   500  func computeHLLResultRecursive(result interface{}) interface{} {
   501  	switch r := result.(type) {
   502  	case AQLQueryResult:
   503  		for k, v := range r {
   504  			r[k] = computeHLLResultRecursive(v)
   505  		}
   506  		return r
   507  	case map[string]interface{}:
   508  		for k, v := range r {
   509  			r[k] = computeHLLResultRecursive(v)
   510  		}
   511  		return r
   512  	case HLL:
   513  		return r.Compute()
   514  	default:
   515  		// return original for all other types
   516  		return r
   517  	}
   518  }
   519  
   520  // NewTimeSeriesHLLResult creates a new NewTimeSeriesHLLResult and deserialize the buffer into the result.
   521  func NewTimeSeriesHLLResult(buffer []byte, magicHeader uint32) (AQLQueryResult, error) {
   522  	switch magicHeader {
   523  	case OldHLLDataHeader:
   524  		return parseOldTimeseriesHLLResult(buffer)
   525  	case HLLDataHeader:
   526  		return parseTimeseriesHLLResult(buffer)
   527  	default:
   528  		// should not happen
   529  		return nil, utils.StackError(nil, "magic header version unsupported: %d", magicHeader)
   530  	}
   531  }
   532  
// memAccess returns the pointer that lies offset bytes after p.
// No bounds checking is performed; the caller is responsible for keeping the
// result inside the memory p refers to.
func memAccess(p unsafe.Pointer, offset int) unsafe.Pointer {
	// The uintptr round trip is kept in a single expression on purpose, as
	// required by the unsafe.Pointer arithmetic rules.
	return unsafe.Pointer(uintptr(p) + uintptr(offset))
}
   537  
   538  // readHLL reads the HLL struct from the raw buffer and returns next offset
   539  func readHLL(hllVector unsafe.Pointer, count uint16, currentOffset *int64) HLL {
   540  	var sparseData []HLLRegister
   541  	var nonZeroRegisters uint16
   542  	var denseData []byte
   543  	if count < DenseThreshold {
   544  		var i uint16
   545  		sparseData = make([]HLLRegister, 0, count)
   546  		for ; i < count; i++ {
   547  			data := *(*uint32)(memAccess(hllVector, int(*currentOffset)))
   548  			index := uint16(data) // Big-endian from UNHEX...
   549  			rho := byte((data >> 16) & 0xFF)
   550  			sparseData = append(sparseData, HLLRegister{
   551  				Index: index,
   552  				Rho:   rho,
   553  			})
   554  			*currentOffset += 4
   555  		}
   556  		nonZeroRegisters = count
   557  	} else {
   558  		denseData = (*(*[DenseDataLength]byte)((memAccess(hllVector, int(*currentOffset)))))[:]
   559  		*currentOffset += DenseDataLength
   560  		for _, b := range denseData {
   561  			if b != 0 {
   562  				nonZeroRegisters++
   563  			}
   564  		}
   565  	}
   566  
   567  	return HLL{
   568  		DenseData:        denseData,
   569  		SparseData:       sparseData,
   570  		NonZeroRegisters: nonZeroRegisters,
   571  	}
   572  }
   573  
   574  // ParseHLLQueryResults will parse the response body into a slice of query results and a slice of errors.
   575  func ParseHLLQueryResults(data []byte) (queryResults []AQLQueryResult, queryErrors []error, err error) {
   576  	reader := utils.NewStreamDataReader(bytes.NewBuffer(data))
   577  
   578  	var magicHeader uint32
   579  	magicHeader, err = reader.ReadUint32()
   580  	if err != nil {
   581  		return
   582  	}
   583  
   584  	if magicHeader != OldHLLDataHeader && magicHeader != HLLDataHeader {
   585  		err = utils.StackError(nil, "header %x does not match HLLDataHeader %x or %x",
   586  			magicHeader, OldHLLDataHeader, HLLDataHeader)
   587  		return
   588  	}
   589  
   590  	reader.SkipBytes(4)
   591  
   592  	var size uint32
   593  	var isErr uint8
   594  
   595  	for size, err = reader.ReadUint32(); err == nil; size, err = reader.ReadUint32() {
   596  		if isErr, err = reader.ReadUint8(); err != nil {
   597  			return
   598  		}
   599  
   600  		reader.SkipBytes(3)
   601  
   602  		bs := make([]byte, size)
   603  		err = reader.Read(bs)
   604  		if err != nil {
   605  			break
   606  		}
   607  
   608  		if isErr != 0 {
   609  			queryErrors = append(queryErrors, errors.New(string(bs)))
   610  			queryResults = append(queryResults, nil)
   611  		} else {
   612  			var res AQLQueryResult
   613  			if res, err = NewTimeSeriesHLLResult(bs, magicHeader); err != nil {
   614  				return
   615  			}
   616  			queryResults = append(queryResults, res)
   617  			queryErrors = append(queryErrors, nil)
   618  		}
   619  	}
   620  
   621  	if err == io.EOF {
   622  		err = nil
   623  	}
   624  	return
   625  }
   626  
// hllBiasByDistance pairs a bias value with the squared distance between its
// raw estimate and the estimate being corrected (see getEstimateBias).
type hllBiasByDistance struct {
	distance, bias float64
}
   630  
   631  func getEstimateBias(estimate float64) float64 {
   632  	i := sort.Search(len(hllRawEstimates), func(i int) bool { return estimate < hllRawEstimates[i] })
   633  
   634  	// Find nearest k neighbors.
   635  	k := 6
   636  	startIdx := i - 1 - k
   637  	endIdx := i + k
   638  	if startIdx < 0 {
   639  		startIdx = 0
   640  	}
   641  	if endIdx > len(hllRawEstimates) {
   642  		endIdx = len(hllRawEstimates)
   643  	}
   644  	biases := make(hllBiasesByDistances, endIdx-startIdx)
   645  	for i := startIdx; i < endIdx; i++ {
   646  		biases[i-startIdx].distance = (hllRawEstimates[i] - estimate) * (hllRawEstimates[i] - estimate)
   647  		biases[i-startIdx].bias = hllBiases[i]
   648  	}
   649  	sort.Sort(biases)
   650  
   651  	biasSum := 0.0
   652  	for i := 0; i < k; i++ {
   653  		biasSum += biases[i].bias
   654  	}
   655  
   656  	return biasSum / float64(k)
   657  }
   658  
   659  // Decode decodes the HLL from cache cache.
   660  // Interprets as dense or sparse format based on len(data).
   661  func (hll *HLL) Decode(data []byte) {
   662  	if len(data) == 1<<hllP {
   663  		hll.DenseData = data
   664  		hll.SparseData = nil
   665  		hll.NonZeroRegisters = 0
   666  		for _, rho := range data {
   667  			if rho != 0 {
   668  				hll.NonZeroRegisters++
   669  			}
   670  		}
   671  	} else {
   672  		hll.DenseData = nil
   673  		hll.SparseData = make([]HLLRegister, len(data)/3)
   674  		hll.NonZeroRegisters = uint16(len(data) / 3)
   675  		for i := 0; i < len(data)/3; i++ {
   676  			var register HLLRegister
   677  			register.Index = uint16(data[i*3]) | (uint16(data[i*3+1]) << 8)
   678  			register.Rho = data[i*3+2]
   679  			hll.SparseData[i] = register
   680  		}
   681  	}
   682  }
   683  
   684  // Encode encodes the HLL for cache storage.
   685  // Dense format will have a length of 1<<hllP.
   686  // Sparse format will have a smaller length
   687  func (hll *HLL) Encode() []byte {
   688  	if len(hll.DenseData) != 0 {
   689  		return hll.DenseData
   690  	}
   691  	data := make([]byte, 3*len(hll.SparseData))
   692  	for i, register := range hll.SparseData {
   693  		data[i*3] = byte(register.Index & 0xff)
   694  		data[i*3+1] = byte(register.Index >> 8)
   695  		data[i*3+2] = register.Rho
   696  	}
   697  	return data
   698  }
   699  
   700  // Compute computes the result of the HLL.
   701  func (hll *HLL) Compute() float64 {
   702  	nonZeroRegisters := float64(hll.NonZeroRegisters)
   703  	m := float64(uint64(1) << hllP)
   704  
   705  	// Sum of reciproclas of rhos
   706  	var sumOfReciprocals float64
   707  	for _, register := range hll.SparseData {
   708  		sumOfReciprocals += 1.0 / float64(uint64(1)<<register.Rho)
   709  	}
   710  	if len(hll.DenseData) == 0 {
   711  		// Add missing rho reciprocals for sparse form.
   712  		sumOfReciprocals += m - nonZeroRegisters
   713  	}
   714  	for _, rho := range hll.DenseData {
   715  		sumOfReciprocals += 1.0 / float64(uint64(1)<<rho)
   716  	}
   717  
   718  	// Initial estimation.
   719  	alpha := 0.7213 / (1 + 1.079/m)
   720  	estimate := alpha * m * m / sumOfReciprocals
   721  
   722  	// Bias correction.
   723  	if estimate <= 5.0*m {
   724  		estimate -= getEstimateBias(estimate)
   725  	}
   726  
   727  	estimateH := estimate
   728  
   729  	if nonZeroRegisters < m {
   730  		// Linear counting
   731  		estimateH = m * math.Log(m/(m-nonZeroRegisters))
   732  	}
   733  
   734  	if estimateH <= hllThreshold {
   735  		estimate = estimateH
   736  	}
   737  
   738  	// Round
   739  	return float64(uint64(estimate))
   740  }
   741  
   742  type hllBiasesByDistances []hllBiasByDistance
   743  
   744  func (b hllBiasesByDistances) Len() int      { return len(b) }
   745  func (b hllBiasesByDistances) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   746  func (b hllBiasesByDistances) Less(i, j int) bool {
   747  	return b[i].distance < b[j].distance
   748  }
   749  
// threshold and bias data taken from google's bias correction data set:
// https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/view?fullscreen#

// hllP is the precision of the HLL: a dense HLL has 1<<hllP registers.
var hllP byte = 14

// hllThreshold is the cutoff used by HLL.Compute: the linear counting
// estimate replaces the bias-corrected estimate when it does not exceed it.
var hllThreshold = 15500.0
   755  
// hllRawEstimates lists the raw estimates for precision 14. getEstimateBias
// searches it with sort.Search, which relies on the ascending order, and
// uses the entry at the same index in hllBiases as the matching bias.
var hllRawEstimates = []float64{
	11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178,
	13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134,
	15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204,
	17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408,
	19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723,
	21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728,
	24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132,
	26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008,
	29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752,
	32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849,
	34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354,
	37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036,
	40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375,
	43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724,
	46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, 49613.7796,
	50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319,
	53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748,
	56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962,
	59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042,
	62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916,
	65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, 68821.22,
	69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006,
	72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904,
	75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962,
	79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884}
   783  
// hllBiases lists the biases for precision 14. getEstimateBias reads the
// entry at the same index as the hllRawEstimates entry it was matched with.
var hllBiases = []float64{
	11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178,
	9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624,
	8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516,
	6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924,
	5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127,
	4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828,
	3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494,
	2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498,
	2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849,
	1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606,
	1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436,
	990.901400000002, 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375,
	810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858,
	652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999,
	541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002,
	449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997,
	357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999,
	279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001,
	218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268,
	164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003,
	117.614399999999, 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028,
	84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989,
	50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979,
	29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116,
	13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615,
	6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039,
	-3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987,
	-24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067,
	-32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057,
	-54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042,
	-49.9551999999967, -42.6116000000038}