github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/fmsketch.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"hash"
    18  
    19  	"github.com/whtcorpsinc/errors"
    20  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    21  	"github.com/whtcorpsinc/milevadb/types"
    22  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    23  	"github.com/whtcorpsinc/fidelpb/go-fidelpb"
    24  	"github.com/twmb/murmur3"
    25  )
    26  
    27  // FMSketch is used to count the number of distinct elements in a set.
    28  type FMSketch struct {
    29  	hashset  map[uint64]bool
    30  	mask     uint64
    31  	maxSize  int
    32  	hashFunc hash.Hash64
    33  }
    34  
    35  // NewFMSketch returns a new FM sketch.
    36  func NewFMSketch(maxSize int) *FMSketch {
    37  	return &FMSketch{
    38  		hashset:  make(map[uint64]bool),
    39  		maxSize:  maxSize,
    40  		hashFunc: murmur3.New64(),
    41  	}
    42  }
    43  
    44  // NDV returns the ndv of the sketch.
    45  func (s *FMSketch) NDV() int64 {
    46  	return int64(s.mask+1) * int64(len(s.hashset))
    47  }
    48  
    49  func (s *FMSketch) insertHashValue(hashVal uint64) {
    50  	if (hashVal & s.mask) != 0 {
    51  		return
    52  	}
    53  	s.hashset[hashVal] = true
    54  	if len(s.hashset) > s.maxSize {
    55  		s.mask = s.mask*2 + 1
    56  		for key := range s.hashset {
    57  			if (key & s.mask) != 0 {
    58  				delete(s.hashset, key)
    59  			}
    60  		}
    61  	}
    62  }
    63  
    64  // InsertValue inserts a value into the FM sketch.
    65  func (s *FMSketch) InsertValue(sc *stmtctx.StatementContext, value types.Causet) error {
    66  	bytes, err := codec.EncodeValue(sc, nil, value)
    67  	if err != nil {
    68  		return errors.Trace(err)
    69  	}
    70  	s.hashFunc.Reset()
    71  	_, err = s.hashFunc.Write(bytes)
    72  	if err != nil {
    73  		return errors.Trace(err)
    74  	}
    75  	s.insertHashValue(s.hashFunc.Sum64())
    76  	return nil
    77  }
    78  
    79  func buildFMSketch(sc *stmtctx.StatementContext, values []types.Causet, maxSize int) (*FMSketch, int64, error) {
    80  	s := NewFMSketch(maxSize)
    81  	for _, value := range values {
    82  		err := s.InsertValue(sc, value)
    83  		if err != nil {
    84  			return nil, 0, errors.Trace(err)
    85  		}
    86  	}
    87  	return s, s.NDV(), nil
    88  }
    89  
    90  func (s *FMSketch) mergeFMSketch(rs *FMSketch) {
    91  	if s.mask < rs.mask {
    92  		s.mask = rs.mask
    93  		for key := range s.hashset {
    94  			if (key & s.mask) != 0 {
    95  				delete(s.hashset, key)
    96  			}
    97  		}
    98  	}
    99  	for key := range rs.hashset {
   100  		s.insertHashValue(key)
   101  	}
   102  }
   103  
   104  // FMSketchToProto converts FMSketch to its protobuf representation.
   105  func FMSketchToProto(s *FMSketch) *fidelpb.FMSketch {
   106  	protoSketch := new(fidelpb.FMSketch)
   107  	protoSketch.Mask = s.mask
   108  	for val := range s.hashset {
   109  		protoSketch.Hashset = append(protoSketch.Hashset, val)
   110  	}
   111  	return protoSketch
   112  }
   113  
   114  // FMSketchFromProto converts FMSketch from its protobuf representation.
   115  func FMSketchFromProto(protoSketch *fidelpb.FMSketch) *FMSketch {
   116  	sketch := &FMSketch{
   117  		hashset: make(map[uint64]bool, len(protoSketch.Hashset)),
   118  		mask:    protoSketch.Mask,
   119  	}
   120  	for _, val := range protoSketch.Hashset {
   121  		sketch.hashset[val] = true
   122  	}
   123  	return sketch
   124  }