github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/interlock/aggfuncs/func_count_distinct.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package aggfuncs
    15  
    16  import (
    17  	"encoding/binary"
    18  	"math"
    19  	"unsafe"
    20  
    21  	"github.com/dgryski/go-farm"
    22  	"github.com/whtcorpsinc/errors"
    23  	"github.com/whtcorpsinc/milevadb/memex"
    24  	"github.com/whtcorpsinc/milevadb/stochastikctx"
    25  	"github.com/whtcorpsinc/milevadb/types"
    26  	"github.com/whtcorpsinc/milevadb/types/json"
    27  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    28  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    29  	"github.com/whtcorpsinc/milevadb/soliton/defCauslate"
    30  	"github.com/whtcorpsinc/milevadb/soliton/replog"
    31  	"github.com/whtcorpsinc/milevadb/soliton/set"
    32  	"github.com/whtcorpsinc/milevadb/soliton/stringutil"
    33  )
    34  
// Sizes of the partial-result structs, computed once and reported as the
// fixed memDelta by the AllocPartialResult implementations below.
const (
	// DefPartialResult4CountDistinctIntSize is the size of partialResult4CountDistinctInt
	DefPartialResult4CountDistinctIntSize = int64(unsafe.Sizeof(partialResult4CountDistinctInt{}))
	// DefPartialResult4CountDistinctRealSize is the size of partialResult4CountDistinctReal
	DefPartialResult4CountDistinctRealSize = int64(unsafe.Sizeof(partialResult4CountDistinctReal{}))
	// DefPartialResult4CountDistinctDecimalSize is the size of partialResult4CountDistinctDecimal
	DefPartialResult4CountDistinctDecimalSize = int64(unsafe.Sizeof(partialResult4CountDistinctDecimal{}))
	// DefPartialResult4CountDistinctDurationSize is the size of partialResult4CountDistinctDuration
	DefPartialResult4CountDistinctDurationSize = int64(unsafe.Sizeof(partialResult4CountDistinctDuration{}))
	// DefPartialResult4CountDistinctStringSize is the size of partialResult4CountDistinctString
	DefPartialResult4CountDistinctStringSize = int64(unsafe.Sizeof(partialResult4CountDistinctString{}))
	// DefPartialResult4CountWithDistinctSize is the size of partialResult4CountWithDistinct
	DefPartialResult4CountWithDistinctSize = int64(unsafe.Sizeof(partialResult4CountWithDistinct{}))
	// DefPartialResult4ApproxCountDistinctSize is the size of partialResult4ApproxCountDistinct
	DefPartialResult4ApproxCountDistinctSize = int64(unsafe.Sizeof(partialResult4ApproxCountDistinct{}))
)
    51  
// partialResult4CountDistinctInt is the per-group state for
// COUNT(DISTINCT <integer arg>): the set of values seen so far.
type partialResult4CountDistinctInt struct {
	valSet set.Int64Set
}

// countOriginalWithDistinct4Int implements COUNT(DISTINCT) specialized for a
// single integer argument.
type countOriginalWithDistinct4Int struct {
	baseCount
}
    59  
// AllocPartialResult creates an empty distinct-int set and reports the fixed
// struct size as the consumed memory delta.
func (e *countOriginalWithDistinct4Int) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctInt{
		valSet: set.NewInt64Set(),
	}), DefPartialResult4CountDistinctIntSize
}

// ResetPartialResult drops all collected values by installing a fresh set.
func (e *countOriginalWithDistinct4Int) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctInt)(pr)
	p.valSet = set.NewInt64Set()
}

// AppendFinalResult2Chunk writes the number of distinct values seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct4Int) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctInt)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
    76  
    77  func (e *countOriginalWithDistinct4Int) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
    78  	p := (*partialResult4CountDistinctInt)(pr)
    79  
    80  	for _, event := range rowsInGroup {
    81  		input, isNull, err := e.args[0].EvalInt(sctx, event)
    82  		if err != nil {
    83  			return memDelta, err
    84  		}
    85  		if isNull {
    86  			continue
    87  		}
    88  		if p.valSet.Exist(input) {
    89  			continue
    90  		}
    91  		p.valSet.Insert(input)
    92  		memDelta += DefInt64Size
    93  	}
    94  
    95  	return memDelta, nil
    96  }
    97  
// partialResult4CountDistinctReal is the per-group state for
// COUNT(DISTINCT <real arg>): the set of float64 values seen so far.
type partialResult4CountDistinctReal struct {
	valSet set.Float64Set
}

// countOriginalWithDistinct4Real implements COUNT(DISTINCT) specialized for a
// single real-valued argument.
type countOriginalWithDistinct4Real struct {
	baseCount
}
   105  
// AllocPartialResult creates an empty distinct-float64 set and reports the
// fixed struct size as the consumed memory delta.
func (e *countOriginalWithDistinct4Real) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctReal{
		valSet: set.NewFloat64Set(),
	}), DefPartialResult4CountDistinctRealSize
}

// ResetPartialResult drops all collected values by installing a fresh set.
func (e *countOriginalWithDistinct4Real) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctReal)(pr)
	p.valSet = set.NewFloat64Set()
}

// AppendFinalResult2Chunk writes the number of distinct values seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct4Real) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctReal)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
   122  
   123  func (e *countOriginalWithDistinct4Real) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
   124  	p := (*partialResult4CountDistinctReal)(pr)
   125  
   126  	for _, event := range rowsInGroup {
   127  		input, isNull, err := e.args[0].EvalReal(sctx, event)
   128  		if err != nil {
   129  			return memDelta, err
   130  		}
   131  		if isNull {
   132  			continue
   133  		}
   134  		if p.valSet.Exist(input) {
   135  			continue
   136  		}
   137  		p.valSet.Insert(input)
   138  		memDelta += DefFloat64Size
   139  	}
   140  
   141  	return memDelta, nil
   142  }
   143  
// partialResult4CountDistinctDecimal is the per-group state for
// COUNT(DISTINCT <decimal arg>). Values are keyed by the decimal's hash key
// (see UFIDelatePartialResult), not by its raw representation.
type partialResult4CountDistinctDecimal struct {
	valSet set.StringSet
}

// countOriginalWithDistinct4Decimal implements COUNT(DISTINCT) specialized
// for a single decimal argument.
type countOriginalWithDistinct4Decimal struct {
	baseCount
}
   151  
// AllocPartialResult creates an empty distinct-key set and reports the fixed
// struct size as the consumed memory delta.
func (e *countOriginalWithDistinct4Decimal) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctDecimal{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountDistinctDecimalSize
}

// ResetPartialResult drops all collected values by installing a fresh set.
func (e *countOriginalWithDistinct4Decimal) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctDecimal)(pr)
	p.valSet = set.NewStringSet()
}

// AppendFinalResult2Chunk writes the number of distinct values seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct4Decimal) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctDecimal)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
   168  
   169  func (e *countOriginalWithDistinct4Decimal) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
   170  	p := (*partialResult4CountDistinctDecimal)(pr)
   171  
   172  	for _, event := range rowsInGroup {
   173  		input, isNull, err := e.args[0].EvalDecimal(sctx, event)
   174  		if err != nil {
   175  			return memDelta, err
   176  		}
   177  		if isNull {
   178  			continue
   179  		}
   180  		hash, err := input.ToHashKey()
   181  		if err != nil {
   182  			return memDelta, err
   183  		}
   184  		decStr := string(replog.String(hash))
   185  		if p.valSet.Exist(decStr) {
   186  			continue
   187  		}
   188  		p.valSet.Insert(decStr)
   189  		memDelta += int64(len(decStr))
   190  	}
   191  
   192  	return memDelta, nil
   193  }
   194  
// partialResult4CountDistinctDuration is the per-group state for
// COUNT(DISTINCT <duration arg>): durations are deduplicated by their
// nanosecond value (the inner time.Duration), ignoring fsp.
type partialResult4CountDistinctDuration struct {
	valSet set.Int64Set
}

// countOriginalWithDistinct4Duration implements COUNT(DISTINCT) specialized
// for a single duration argument.
type countOriginalWithDistinct4Duration struct {
	baseCount
}
   202  
// AllocPartialResult creates an empty distinct-duration set and reports the
// fixed struct size as the consumed memory delta.
func (e *countOriginalWithDistinct4Duration) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctDuration{
		valSet: set.NewInt64Set(),
	}), DefPartialResult4CountDistinctDurationSize
}

// ResetPartialResult drops all collected values by installing a fresh set.
func (e *countOriginalWithDistinct4Duration) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctDuration)(pr)
	p.valSet = set.NewInt64Set()
}

// AppendFinalResult2Chunk writes the number of distinct values seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct4Duration) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctDuration)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
   219  
   220  func (e *countOriginalWithDistinct4Duration) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
   221  	p := (*partialResult4CountDistinctDuration)(pr)
   222  
   223  	for _, event := range rowsInGroup {
   224  		input, isNull, err := e.args[0].EvalDuration(sctx, event)
   225  		if err != nil {
   226  			return memDelta, err
   227  		}
   228  		if isNull {
   229  			continue
   230  		}
   231  
   232  		if p.valSet.Exist(int64(input.Duration)) {
   233  			continue
   234  		}
   235  		p.valSet.Insert(int64(input.Duration))
   236  		memDelta += DefInt64Size
   237  	}
   238  
   239  	return memDelta, nil
   240  }
   241  
// partialResult4CountDistinctString is the per-group state for
// COUNT(DISTINCT <string arg>): strings are deduplicated by their
// defCauslation sort key (see UFIDelatePartialResult).
type partialResult4CountDistinctString struct {
	valSet set.StringSet
}

// countOriginalWithDistinct4String implements COUNT(DISTINCT) specialized for
// a single string argument, honoring the defCausumn's defCauslation.
type countOriginalWithDistinct4String struct {
	baseCount
}
   249  
// AllocPartialResult creates an empty distinct-string set and reports the
// fixed struct size as the consumed memory delta.
func (e *countOriginalWithDistinct4String) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountDistinctString{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountDistinctStringSize
}

// ResetPartialResult drops all collected values by installing a fresh set.
func (e *countOriginalWithDistinct4String) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountDistinctString)(pr)
	p.valSet = set.NewStringSet()
}

// AppendFinalResult2Chunk writes the number of distinct values seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct4String) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountDistinctString)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
   266  
   267  func (e *countOriginalWithDistinct4String) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
   268  	p := (*partialResult4CountDistinctString)(pr)
   269  	defCauslator := defCauslate.GetDefCauslator(e.args[0].GetType().DefCauslate)
   270  
   271  	for _, event := range rowsInGroup {
   272  		input, isNull, err := e.args[0].EvalString(sctx, event)
   273  		if err != nil {
   274  			return memDelta, err
   275  		}
   276  		if isNull {
   277  			continue
   278  		}
   279  		input = string(defCauslator.Key(input))
   280  
   281  		if p.valSet.Exist(input) {
   282  			continue
   283  		}
   284  		input = stringutil.Copy(input)
   285  		p.valSet.Insert(input)
   286  		memDelta += int64(len(input))
   287  	}
   288  
   289  	return memDelta, nil
   290  }
   291  
// countOriginalWithDistinct implements the generic, multi-argument
// COUNT(DISTINCT a, b, ...): each argument tuple is encoded to bytes and the
// encodings are deduplicated.
type countOriginalWithDistinct struct {
	baseCount
}

// partialResult4CountWithDistinct holds the set of encoded argument tuples
// seen so far in the group.
type partialResult4CountWithDistinct struct {
	valSet set.StringSet
}
   299  
// AllocPartialResult creates an empty tuple-encoding set and reports the
// fixed struct size as the consumed memory delta.
func (e *countOriginalWithDistinct) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return PartialResult(&partialResult4CountWithDistinct{
		valSet: set.NewStringSet(),
	}), DefPartialResult4CountWithDistinctSize
}

// ResetPartialResult drops all collected tuples by installing a fresh set.
func (e *countOriginalWithDistinct) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4CountWithDistinct)(pr)
	p.valSet = set.NewStringSet()
}

// AppendFinalResult2Chunk writes the number of distinct tuples seen so far
// into the output chunk at this function's ordinal.
func (e *countOriginalWithDistinct) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4CountWithDistinct)(pr)
	chk.AppendInt64(e.ordinal, int64(p.valSet.Count()))
	return nil
}
   316  
// UFIDelatePartialResult encodes the full argument tuple of every event into
// a byte string and counts distinct encodings. A tuple containing any NULL
// argument is skipped entirely (SQL COUNT(DISTINCT ...) semantics).
func (e *countOriginalWithDistinct) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4CountWithDistinct)(pr)

	encodedBytes := make([]byte, 0)
	// Decimal struct is the biggest type we will use.
	buf := make([]byte, types.MyDecimalStructSize)

	for _, event := range rowsInGroup {
		var err error
		var hasNull, isNull bool
		// Reuse the encoding buffer's backing array across events.
		encodedBytes = encodedBytes[:0]

		for i := 0; i < len(e.args) && !hasNull; i++ {
			encodedBytes, isNull, err = evalAndEncode(sctx, e.args[i], event, buf, encodedBytes)
			if err != nil {
				return memDelta, err
			}
			if isNull {
				hasNull = true
				break
			}
		}
		// string(...) copies, so the stored key is independent of encodedBytes.
		encodedString := string(encodedBytes)
		if hasNull || p.valSet.Exist(encodedString) {
			continue
		}
		p.valSet.Insert(encodedString)
		memDelta += int64(len(encodedString))
	}

	return memDelta, nil
}
   349  
// evalAndEncode evaluates one memex against the given event and appends an
// encoding of the resulting value to encodedBytes, returning the extended
// slice. buf is a scratch buffer shared by the fixed-width encoders; callers
// size it to types.MyDecimalStructSize. isNull reports a NULL result, in
// which case nothing is appended.
func evalAndEncode(
	sctx stochastikctx.Context, arg memex.Expression,
	event chunk.Event, buf, encodedBytes []byte,
) (_ []byte, isNull bool, err error) {
	switch tp := arg.GetType().EvalType(); tp {
	case types.ETInt:
		var val int64
		val, isNull, err = arg.EvalInt(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendInt64(encodedBytes, buf, val)
	case types.ETReal:
		var val float64
		val, isNull, err = arg.EvalReal(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendFloat64(encodedBytes, buf, val)
	case types.ETDecimal:
		var val *types.MyDecimal
		val, isNull, err = arg.EvalDecimal(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes, err = appendDecimal(encodedBytes, val)
	case types.ETTimestamp, types.ETDatetime:
		var val types.Time
		val, isNull, err = arg.EvalTime(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendTime(encodedBytes, buf, val)
	case types.ETDuration:
		var val types.Duration
		val, isNull, err = arg.EvalDuration(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendDuration(encodedBytes, buf, val)
	case types.ETJson:
		var val json.BinaryJSON
		val, isNull, err = arg.EvalJSON(sctx, event)
		if err != nil || isNull {
			break
		}
		encodedBytes = appendJSON(encodedBytes, buf, val)
	case types.ETString:
		var val string
		val, isNull, err = arg.EvalString(sctx, event)
		if err != nil || isNull {
			break
		}
		// Length-prefixed encoding keeps multi-argument tuples unambiguous.
		encodedBytes = codec.EncodeCompactBytes(encodedBytes, replog.Slice(val))
	default:
		return nil, false, errors.Errorf("unsupported defCausumn type for encode %d", tp)
	}
	return encodedBytes, isNull, err
}
   410  
   411  func appendInt64(encodedBytes, buf []byte, val int64) []byte {
   412  	*(*int64)(unsafe.Pointer(&buf[0])) = val
   413  	buf = buf[:8]
   414  	encodedBytes = append(encodedBytes, buf...)
   415  	return encodedBytes
   416  }
   417  
   418  func appendFloat64(encodedBytes, buf []byte, val float64) []byte {
   419  	*(*float64)(unsafe.Pointer(&buf[0])) = val
   420  	buf = buf[:8]
   421  	encodedBytes = append(encodedBytes, buf...)
   422  	return encodedBytes
   423  }
   424  
   425  func appendDecimal(encodedBytes []byte, val *types.MyDecimal) ([]byte, error) {
   426  	hash, err := val.ToHashKey()
   427  	encodedBytes = append(encodedBytes, hash...)
   428  	return encodedBytes, err
   429  }
   430  
   431  func writeTime(buf []byte, t types.Time) {
   432  	binary.BigEndian.PutUint16(buf, uint16(t.Year()))
   433  	buf[2] = uint8(t.Month())
   434  	buf[3] = uint8(t.Day())
   435  	buf[4] = uint8(t.Hour())
   436  	buf[5] = uint8(t.Minute())
   437  	buf[6] = uint8(t.Second())
   438  	binary.BigEndian.PutUint32(buf[8:], uint32(t.Microsecond()))
   439  	buf[12] = t.Type()
   440  	buf[13] = uint8(t.Fsp())
   441  }
   442  
// appendTime appends the 16-byte window of buf filled by writeTime to
// encodedBytes. buf must be at least 16 bytes long.
func appendTime(encodedBytes, buf []byte, val types.Time) []byte {
	writeTime(buf, val)
	buf = buf[:16]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}
   449  
// appendDuration appends the raw in-memory image of the types.Duration struct
// by storing it at the start of buf and copying the first 16 bytes.
// NOTE(review): this assumes unsafe.Sizeof(types.Duration{}) == 16 with no
// padding bytes inside the struct; stale padding in the shared scratch buf
// would make equal durations encode differently — confirm the struct layout.
func appendDuration(encodedBytes, buf []byte, val types.Duration) []byte {
	*(*types.Duration)(unsafe.Pointer(&buf[0])) = val
	buf = buf[:16]
	encodedBytes = append(encodedBytes, buf...)
	return encodedBytes
}
   456  
   457  func appendJSON(encodedBytes, _ []byte, val json.BinaryJSON) []byte {
   458  	encodedBytes = append(encodedBytes, val.TypeCode)
   459  	encodedBytes = append(encodedBytes, val.Value...)
   460  	return encodedBytes
   461  }
   462  
   463  func intHash64(x uint64) uint64 {
   464  	x ^= x >> 33
   465  	x *= 0xff51afd7ed558ccd
   466  	x ^= x >> 33
   467  	x *= 0xc4ceb9fe1a85ec53
   468  	x ^= x >> 33
   469  	return x
   470  }
   471  
// baseApproxCountDistinct holds the behavior shared by every
// APPROX_COUNT_DISTINCT variant: allocation, reset, merge and final output of
// the BJKST sketch.
type baseApproxCountDistinct struct {
	baseAggFunc
}

// Tuning constants of the BJKST sample (see partialResult4ApproxCountDistinct).
const (
	// The maximum degree of buffer size before the values are discarded
	uniquesHashMaxSizeDegree uint8 = 17
	// The maximum number of elements before the values are discarded
	uniquesHashMaxSize = uint32(1) << (uniquesHashMaxSizeDegree - 1)
	// Initial buffer size degree
	uniquesHashSetInitialSizeDegree uint8 = 4
	// The number of least significant bits used for thinning. The remaining high-order bits are used to determine the position in the hash causet.
	uniquesHashBitsForSkip = 32 - uniquesHashMaxSizeDegree
)
   486  
// approxCountDistinctHashValue is a 32-bit hash sample stored in the sketch.
type approxCountDistinctHashValue uint32

// partialResult4ApproxCountDistinct use `BJKST` algorithm to compute approximate result of count distinct.
// According to an experimental survey http://www.vldb.org/pvldb/vol11/p499-harmouch.pdf, the error guarantee of BJKST
// was even better than the theoretical lower bounds.
// For the calculation state, it uses a sample of element hash values with a size up to uniquesHashMaxSize. Compared
// with the widely known HyperLogLog algorithm, this algorithm is less effective in terms of accuracy and
// memory consumption (even up to proportionality), but it is adaptive. This means that with fairly high accuracy, it
// consumes less memory during simultaneous computation of cardinality for a large number of data sets whose cardinality
// has power law distribution (i.e. in cases when most of the data sets are small).
// This algorithm is also very accurate for data sets with small cardinality and very efficient on CPU. If number of
// distinct element is more than 2^32, relative error may be high.
type partialResult4ApproxCountDistinct struct {
	size       uint32 /// Number of elements.
	sizeDegree uint8  /// The size of the causet as a power of 2.
	skiFIDelegree uint8  /// Skip elements not divisible by 2 ^ skiFIDelegree.
	hasZero    bool   /// The hash causet contains an element with a hash value of 0.
	buf        []approxCountDistinctHashValue
}
   506  
// NewPartialResult4ApproxCountDistinct builds a partial result for agg function ApproxCountDistinct.
func NewPartialResult4ApproxCountDistinct() *partialResult4ApproxCountDistinct {
	p := &partialResult4ApproxCountDistinct{}
	p.reset()
	return p
}

// InsertHash64 folds a 64-bit hash into the 32-bit sample.
func (p *partialResult4ApproxCountDistinct) InsertHash64(x uint64) {
	// no need to rehash, just cast into uint32
	p.insertHash(approxCountDistinctHashValue(x))
}
   518  
   519  func (p *partialResult4ApproxCountDistinct) MemUsage() int64 {
   520  	return int64(len(p.buf)) * DefUint32Size
   521  }
   522  
   523  func (p *partialResult4ApproxCountDistinct) alloc(newSizeDegree uint8) {
   524  	p.size = 0
   525  	p.skiFIDelegree = 0
   526  	p.hasZero = false
   527  	p.buf = make([]approxCountDistinctHashValue, uint32(1)<<newSizeDegree)
   528  	p.sizeDegree = newSizeDegree
   529  }
   530  
   531  func (p *partialResult4ApproxCountDistinct) reset() {
   532  	p.alloc(uniquesHashSetInitialSizeDegree)
   533  }
   534  
   535  func max(a, b uint8) uint8 {
   536  	if a > b {
   537  		return a
   538  	}
   539  
   540  	return b
   541  }
   542  
// bufSize returns the number of slots in the hash buffer, always 2^sizeDegree.
func (p *partialResult4ApproxCountDistinct) bufSize() uint32 {
	return uint32(1) << p.sizeDegree
}

// mask wraps a probe position into the buffer (bufSize is a power of two).
func (p *partialResult4ApproxCountDistinct) mask() uint32 {
	return p.bufSize() - 1
}

// place computes the home slot of a hash from its high-order bits; the low
// bits are reserved for the skip filter (see good).
func (p *partialResult4ApproxCountDistinct) place(x approxCountDistinctHashValue) uint32 {
	return uint32(x>>uniquesHashBitsForSkip) & p.mask()
}
   554  
   555  // Increase the size of the buffer 2 times or up to new size degree.
   556  func (p *partialResult4ApproxCountDistinct) resize(newSizeDegree uint8) {
   557  	oldSize := p.bufSize()
   558  	oldBuf := p.buf
   559  
   560  	if 0 == newSizeDegree {
   561  		newSizeDegree = p.sizeDegree + 1
   562  	}
   563  
   564  	p.buf = make([]approxCountDistinctHashValue, uint32(1)<<newSizeDegree)
   565  	p.sizeDegree = newSizeDegree
   566  
   567  	// Move some items to new locations.
   568  	for i := uint32(0); i < oldSize; i++ {
   569  		x := oldBuf[i]
   570  		if x != 0 {
   571  			p.reinsertImpl(x)
   572  		}
   573  	}
   574  }
   575  
// readAndMerge deserializes a sketch produced by Serialize and merges it into
// p. Layout: 1 byte skip degree, uvarint element count, then one 4-byte hash
// per element.
func (p *partialResult4ApproxCountDistinct) readAndMerge(rb []byte) error {
	rhsSkiFIDelegree := rb[0]
	rb = rb[1:]

	// Adopt the stricter skip degree first so both sides filter identically.
	if rhsSkiFIDelegree > p.skiFIDelegree {
		p.skiFIDelegree = rhsSkiFIDelegree
		p.rehash()
	}

	rb, rhsSize, err := codec.DecodeUvarint(rb)

	if err != nil {
		return err
	}

	if rhsSize > uint64(uniquesHashMaxSize) {
		return errors.New("Cannot read partialResult4ApproxCountDistinct: too large size degree")
	}

	// Grow once up front instead of resizing repeatedly while inserting.
	if p.bufSize() < uint32(rhsSize) {
		newSizeDegree := max(uniquesHashSetInitialSizeDegree, uint8(math.Log2(float64(rhsSize-1)))+2)
		p.resize(newSizeDegree)
	}

	for i := uint32(0); i < uint32(rhsSize); i++ {
		// NOTE(review): values are read with a native-endian unsafe load while
		// Serialize writes little endian — consistent only on little-endian
		// hosts; confirm whether big-endian platforms are supported.
		x := *(*approxCountDistinctHashValue)(unsafe.Pointer(&rb[0]))
		rb = rb[4:]
		p.insertHash(x)
	}

	return err
}
   608  
// Correct system errors due to defCauslisions during hashing in uint32.
// fixedSize extrapolates the true cardinality from the sample: each kept
// element stands for 2^skiFIDelegree originals, with a pseudo-random
// remainder and a linear-counting-style correction for hash defCauslisions
// across the 2^32 bucket space.
func (p *partialResult4ApproxCountDistinct) fixedSize() uint64 {
	if 0 == p.skiFIDelegree {
		return uint64(p.size)
	}

	res := uint64(p.size) * (uint64(1) << p.skiFIDelegree)

	// Pseudo-random remainder.
	res += intHash64(uint64(p.size)) & ((uint64(1) << p.skiFIDelegree) - 1)

	// When different elements randomly scattered across 2^32 buckets, filled buckets with average of `res` obtained.
	p32 := uint64(1) << 32
	fixedRes := math.Round(float64(p32) * (math.Log(float64(p32)) - math.Log(float64(p32-res))))
	return uint64(fixedRes)
}
   625  
   626  func (p *partialResult4ApproxCountDistinct) insertHash(hashValue approxCountDistinctHashValue) {
   627  	if !p.good(hashValue) {
   628  		return
   629  	}
   630  
   631  	p.insertImpl(hashValue)
   632  	p.shrinkIfNeed()
   633  }
   634  
   635  // The value is divided by 2 ^ skip_degree
   636  func (p *partialResult4ApproxCountDistinct) good(hash approxCountDistinctHashValue) bool {
   637  	return hash == ((hash >> p.skiFIDelegree) << p.skiFIDelegree)
   638  }
   639  
   640  // Insert a value
   641  func (p *partialResult4ApproxCountDistinct) insertImpl(x approxCountDistinctHashValue) {
   642  	if x == 0 {
   643  		if !p.hasZero {
   644  			p.size += 1
   645  		}
   646  		p.hasZero = true
   647  		return
   648  	}
   649  
   650  	placeValue := p.place(x)
   651  	for p.buf[placeValue] != 0 && p.buf[placeValue] != x {
   652  		placeValue++
   653  		placeValue &= p.mask()
   654  	}
   655  
   656  	if p.buf[placeValue] == x {
   657  		return
   658  	}
   659  
   660  	p.buf[placeValue] = x
   661  	p.size++
   662  }
   663  
   664  // If the hash causet is full enough, then do resize.
   665  // If there are too many items, then throw half the pieces until they are small enough.
   666  func (p *partialResult4ApproxCountDistinct) shrinkIfNeed() {
   667  	if p.size > p.maxFill() {
   668  		if p.size > uniquesHashMaxSize {
   669  			for p.size > uniquesHashMaxSize {
   670  				p.skiFIDelegree++
   671  				p.rehash()
   672  			}
   673  		} else {
   674  			p.resize(0)
   675  		}
   676  	}
   677  }
   678  
   679  func (p *partialResult4ApproxCountDistinct) maxFill() uint32 {
   680  	return uint32(1) << (p.sizeDegree - 1)
   681  }
   682  
// Delete all values whose hashes do not divide by 2 ^ skip_degree.
// rehash runs after skiFIDelegree grows. The two passes are ordered on
// purpose: first every now-filtered value is removed, then surviving values
// whose home slot moved are re-placed so linear probing stays consistent.
func (p *partialResult4ApproxCountDistinct) rehash() {
	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 && !p.good(p.buf[i]) {
			p.buf[i] = 0
			p.size--
		}
	}

	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 && i != p.place(p.buf[i]) {
			x := p.buf[i]
			p.buf[i] = 0
			p.reinsertImpl(x)
		}
	}
}
   700  
   701  // Insert a value into the new buffer that was in the old buffer.
   702  // Used when increasing the size of the buffer, as well as when reading from a file.
   703  func (p *partialResult4ApproxCountDistinct) reinsertImpl(x approxCountDistinctHashValue) {
   704  	placeValue := p.place(x)
   705  	for p.buf[placeValue] != 0 {
   706  		placeValue++
   707  		placeValue &= p.mask()
   708  	}
   709  
   710  	p.buf[placeValue] = x
   711  }
   712  
// merge folds tar into p. The order matters: p first adopts the stricter
// skip degree (dropping its own no-longer-good values), then takes tar's
// zero-marker, then every surviving value from tar's buffer.
func (p *partialResult4ApproxCountDistinct) merge(tar *partialResult4ApproxCountDistinct) {
	if tar.skiFIDelegree > p.skiFIDelegree {
		p.skiFIDelegree = tar.skiFIDelegree
		p.rehash()
	}

	if !p.hasZero && tar.hasZero {
		p.hasZero = true
		p.size++
		p.shrinkIfNeed()
	}

	for i := uint32(0); i < tar.bufSize(); i++ {
		// Values from tar that fail p's (possibly stricter) filter are dropped.
		if tar.buf[i] != 0 && p.good(tar.buf[i]) {
			p.insertImpl(tar.buf[i])
			p.shrinkIfNeed()
		}
	}
}
   732  
// Serialize encodes the sketch for transfer between aggregation stages. The
// layout mirrors readAndMerge: 1 byte skip degree, uvarint element count,
// then each stored hash as 4 little-endian bytes (a zero value first when
// hasZero is set, since zero cannot be stored in the buffer itself).
func (p *partialResult4ApproxCountDistinct) Serialize() []byte {
	var buf [4]byte
	res := make([]byte, 0, 1+binary.MaxVarintLen64+p.size*4)

	res = append(res, p.skiFIDelegree)
	res = codec.EncodeUvarint(res, uint64(p.size))

	if p.hasZero {
		binary.LittleEndian.PutUint32(buf[:], 0)
		res = append(res, buf[:]...)
	}

	for i := uint32(0); i < p.bufSize(); i++ {
		if p.buf[i] != 0 {
			binary.LittleEndian.PutUint32(buf[:], uint32(p.buf[i]))
			res = append(res, buf[:]...)
		}
	}
	return res
}
   753  
// AppendFinalResult2Chunk writes the defCauslision-corrected cardinality
// estimate into the output chunk at this function's ordinal.
func (e *baseApproxCountDistinct) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4ApproxCountDistinct)(pr)
	chk.AppendInt64(e.ordinal, int64(p.fixedSize()))
	return nil
}

// AllocPartialResult creates an empty sketch and reports the fixed struct
// size as the consumed memory delta.
func (e *baseApproxCountDistinct) AllocPartialResult() (pr PartialResult, memDelta int64) {
	return (PartialResult)(NewPartialResult4ApproxCountDistinct()), DefPartialResult4ApproxCountDistinctSize
}

// ResetPartialResult restores the sketch to its initial empty state.
func (e *baseApproxCountDistinct) ResetPartialResult(pr PartialResult) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	p.reset()
}

// MergePartialResult folds the src sketch into dst in place; buffer growth is
// not accounted for here (memDelta is always 0).
func (e *baseApproxCountDistinct) MergePartialResult(sctx stochastikctx.Context, src, dst PartialResult) (memDelta int64, err error) {
	p1, p2 := (*partialResult4ApproxCountDistinct)(src), (*partialResult4ApproxCountDistinct)(dst)
	p2.merge(p1)
	return 0, nil
}
   774  
// approxCountDistinctOriginal implements complete-mode APPROX_COUNT_DISTINCT:
// every argument tuple is hashed into the sketch directly.
type approxCountDistinctOriginal struct {
	baseApproxCountDistinct
}

// UFIDelatePartialResult encodes each event's argument tuple, hashes the
// encoding with farm.Hash64 and inserts the hash into the sketch. Tuples
// containing a NULL argument are skipped. memDelta reports only the growth of
// the sketch's hash buffer.
func (e *approxCountDistinctOriginal) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	encodedBytes := make([]byte, 0)
	// Decimal struct is the biggest type we will use.
	buf := make([]byte, types.MyDecimalStructSize)

	for _, event := range rowsInGroup {
		var err error
		var hasNull, isNull bool
		// Reuse the encoding buffer's backing array across events.
		encodedBytes = encodedBytes[:0]

		for i := 0; i < len(e.args) && !hasNull; i++ {
			encodedBytes, isNull, err = evalAndEncode(sctx, e.args[i], event, buf, encodedBytes)
			if err != nil {
				return memDelta, err
			}
			if isNull {
				hasNull = true
				break
			}
		}
		if hasNull {
			continue
		}
		oldMemUsage := p.MemUsage()
		x := farm.Hash64(encodedBytes)
		p.InsertHash64(x)
		newMemUsage := p.MemUsage()
		memDelta += newMemUsage - oldMemUsage
	}

	return memDelta, nil
}
   812  
// approxCountDistinctPartial1 is the map-side partial aggregate: it collects
// like approxCountDistinctOriginal but emits the serialized sketch instead of
// a number.
type approxCountDistinctPartial1 struct {
	approxCountDistinctOriginal
}

// AppendFinalResult2Chunk emits the serialized sketch bytes so a downstream
// stage can merge them (see approxCountDistinctPartial2.readAndMerge path).
func (e *approxCountDistinctPartial1) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	p := (*partialResult4ApproxCountDistinct)(pr)
	chk.AppendBytes(e.ordinal, p.Serialize())
	return nil
}
   822  
// approxCountDistinctPartial2 consumes the serialized sketches produced by
// approxCountDistinctPartial1 and merges them into one sketch.
type approxCountDistinctPartial2 struct {
	approxCountDistinctPartial1
}

// UFIDelatePartialResult merges every non-NULL serialized sketch in the group
// into this partial result, accounting for hash-buffer growth in memDelta.
func (e *approxCountDistinctPartial2) UFIDelatePartialResult(sctx stochastikctx.Context, rowsInGroup []chunk.Event, pr PartialResult) (memDelta int64, err error) {
	p := (*partialResult4ApproxCountDistinct)(pr)
	for _, event := range rowsInGroup {
		input, isNull, err := e.args[0].EvalString(sctx, event)
		if err != nil {
			return memDelta, err
		}

		if isNull {
			continue
		}

		oldMemUsage := p.MemUsage()
		// replog.Slice presumably views the string as bytes without copying —
		// safe here because readAndMerge only reads from it; confirm.
		err = p.readAndMerge(replog.Slice(input))
		if err != nil {
			return memDelta, err
		}
		newMemUsage := p.MemUsage()
		memDelta += newMemUsage - oldMemUsage
	}
	return memDelta, nil
}
   849  
// approxCountDistinctFinal turns a sketch built from merged partials into the
// numeric estimate.
type approxCountDistinctFinal struct {
	approxCountDistinctPartial2
}

// AppendFinalResult2Chunk delegates to the base implementation, which writes
// the corrected estimate rather than the serialized form emitted by partial1.
func (e *approxCountDistinctFinal) AppendFinalResult2Chunk(sctx stochastikctx.Context, pr PartialResult, chk *chunk.Chunk) error {
	return e.baseApproxCountDistinct.AppendFinalResult2Chunk(sctx, pr, chk)
}