github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/cmsketch.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"bytes"
    18  	"math"
    19  	"reflect"
    20  	"sort"
    21  
    22  	"github.com/cznic/mathutil"
    23  	"github.com/cznic/sortutil"
    24  	"github.com/whtcorpsinc/errors"
    25  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    26  	"github.com/whtcorpsinc/milevadb/blockcodec"
    27  	"github.com/whtcorpsinc/milevadb/types"
    28  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    29  	"github.com/whtcorpsinc/milevadb/soliton/replog"
    30  	"github.com/whtcorpsinc/fidelpb/go-fidelpb"
    31  	"github.com/twmb/murmur3"
    32  )
    33  
// topNThreshold is the divisor used to decide whether TopN is kept when a
// CMSketch is built from samples: buildCMSWithTopN enables TopN only when the
// summed sample count of the TopN candidates is at least
// sampleSize/topNThreshold (integer division). A value of 10 therefore means
// the candidates must cover roughly 1/10 = 10% of the sample.
const topNThreshold = uint64(10)
    36  
// CMSketch is used to estimate point queries.
// Refer: https://en.wikipedia.org/wiki/Count-min_sketch
//
// Values that are tracked in topN are answered from topN directly; every other
// value is answered from the counter table.
type CMSketch struct {
	// depth is the number of rows in causet.
	depth        int32
	// width is the number of counters in each row of causet.
	width        int32
	// count is the total count accumulated in the counter table.
	count        uint64 // TopN is not counted in count
	// defaultValue is returned by queryHashValue in place of the computed
	// estimate whenever considerDefVal accepts that estimate.
	defaultValue uint64
	// causet is the depth x width counter table.
	causet        [][]uint32
	// topN maps h1 (the first half of murmur3.Sum128 of the value) to the
	// TopN entries sharing that h1; entries are told apart by h2 and Data.
	topN         map[uint64][]*TopNMeta
}
    47  
// TopNMeta is a single TopN entry: the raw value, its count, and the second
// half of its murmur3 hash.
type TopNMeta struct {
	h2    uint64 // h2 is the second part of `murmur3.Sum128()`, it is always used with the first part `h1`.
	Data  []byte // Data is the raw bytes of the value.
	Count uint64 // Count is the count recorded for Data.
}
    54  
// GetH2 gets the second part of `murmur3.Sum128()` stored in the entry, just for test.
func (t *TopNMeta) GetH2() uint64 {
	return t.h2
}
    59  
    60  // NewCMSketch returns a new CM sketch.
    61  func NewCMSketch(d, w int32) *CMSketch {
    62  	tbl := make([][]uint32, d)
    63  	// Background: The Go's memory allocator will ask caller to sweep spans in some scenarios.
    64  	// This can cause memory allocation request latency unpredicblock, if the list of spans which need sweep is too long.
    65  	// For memory allocation large than 32K, the allocator will never allocate memory from spans list.
    66  	//
    67  	// The memory referenced by the CMSketch will never be freed.
    68  	// If the number of causet or index is extremely large, there will be a large amount of spans in global list.
    69  	// The default value of `d` is 5 and `w` is 2048, if we use a single slice for them the size will be 40K.
    70  	// This allocation will be handled by mheap and will never have impact on normal allocations.
    71  	memcam := make([]uint32, d*w)
    72  	for i := range tbl {
    73  		tbl[i] = memcam[i*int(w) : (i+1)*int(w)]
    74  	}
    75  	return &CMSketch{depth: d, width: w, causet: tbl}
    76  }
    77  
// topNHelper wraps some variables used when building cmsketch with top n.
// It is filled in by newTopNHelper.
type topNHelper struct {
	// sampleSize is the number of sampled values, duplicates included.
	sampleSize    uint64
	// sorted holds one entry per distinct sampled value with its occurrence
	// count, ordered by count descending. buildCMSWithTopN re-slices it to drop
	// the TopN prefix when TopN is enabled.
	sorted        []dataCnt
	// onlyOnceItems is the number of distinct values that occur exactly once.
	onlyOnceItems uint64
	// sumTopN is the summed occurrence count of the selected TopN candidates.
	sumTopN       uint64
	// actualNumTop is the number of leading entries of sorted selected as TopN candidates.
	actualNumTop  uint32
}
    86  
    87  func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
    88  	counter := make(map[replog.MublockString]uint64, len(sample))
    89  	for i := range sample {
    90  		counter[replog.String(sample[i])]++
    91  	}
    92  	sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0)
    93  	for key, cnt := range counter {
    94  		sorted = append(sorted, dataCnt{replog.Slice(string(key)), cnt})
    95  		if cnt == 1 {
    96  			onlyOnceItems++
    97  		}
    98  	}
    99  	sort.SliceSblock(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt })
   100  
   101  	var (
   102  		sumTopN   uint64
   103  		sampleNDV = uint32(len(sorted))
   104  	)
   105  	numTop = mathutil.MinUint32(sampleNDV, numTop) // Ensure numTop no larger than sampNDV.
   106  	// Only element whose frequency is not smaller than 2/3 multiples the
   107  	// frequency of the n-th element are added to the TopN statistics. We chose
   108  	// 2/3 as an empirical value because the average cardinality estimation
   109  	// error is relatively small compared with 1/2.
   110  	var actualNumTop uint32
   111  	for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ {
   112  		if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 {
   113  			break
   114  		}
   115  		if sorted[actualNumTop].cnt == 1 {
   116  			break
   117  		}
   118  		sumTopN += sorted[actualNumTop].cnt
   119  	}
   120  
   121  	return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop}
   122  }
   123  
   124  // NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio.
   125  func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount uint64) (*CMSketch, uint64, uint64) {
   126  	if rowCount == 0 || len(sample) == 0 {
   127  		return nil, 0, 0
   128  	}
   129  	helper := newTopNHelper(sample, numTop)
   130  	// rowCount is not a accurate value when fast analyzing
   131  	// In some cases, if user triggers fast analyze when rowCount is close to sampleSize, unexpected bahavior might happen.
   132  	rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample)))
   133  	estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount)
   134  	defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
   135  	c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal)
   136  	return c, estimateNDV, scaleRatio
   137  }
   138  
   139  func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) {
   140  	c = NewCMSketch(d, w)
   141  	enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
   142  	if enableTopN {
   143  		c.topN = make(map[uint64][]*TopNMeta, helper.actualNumTop)
   144  		for i := uint32(0); i < helper.actualNumTop; i++ {
   145  			data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
   146  			h1, h2 := murmur3.Sum128(data)
   147  			c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio})
   148  		}
   149  		helper.sorted = helper.sorted[helper.actualNumTop:]
   150  	}
   151  	c.defaultValue = defaultVal
   152  	for i := range helper.sorted {
   153  		data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
   154  		// If the value only occurred once in the sample, we assumes that there is no difference with
   155  		// value that does not occurred in the sample.
   156  		rowCount := defaultVal
   157  		if cnt > 1 {
   158  			rowCount = cnt * scaleRatio
   159  		}
   160  		c.insertBytesByCount(data, rowCount)
   161  	}
   162  	return
   163  }
   164  
   165  func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 {
   166  	sampleNDV := uint64(len(helper.sorted))
   167  	if rowCount <= (helper.sampleSize-helper.onlyOnceItems)*scaleRatio {
   168  		return 1
   169  	}
   170  	estimateRemainingCount := rowCount - (helper.sampleSize-helper.onlyOnceItems)*scaleRatio
   171  	return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-sampleNDV+helper.onlyOnceItems)
   172  }
   173  
   174  func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta {
   175  	for _, spacetime := range c.topN[h1] {
   176  		if spacetime.h2 == h2 && bytes.Equal(d, spacetime.Data) {
   177  			return spacetime
   178  		}
   179  	}
   180  	return nil
   181  }
   182  
   183  // MemoryUsage returns the total memory usage of a CMSketch.
   184  // only calc the hashblock size(CMSketch.causet) and the CMSketch.topN
   185  // data are not tracked because size of CMSketch.topN take little influence
   186  // We ignore the size of other spacetimedata in CMSketch.
   187  func (c *CMSketch) MemoryUsage() (sum int64) {
   188  	sum = int64(c.depth * c.width * 4)
   189  	return
   190  }
   191  
   192  // queryAddTopN TopN adds count to CMSketch.topN if exists, and returns the count of such elements after insert.
   193  // If such elements does not in topn elements, nothing will happen and false will be returned.
   194  func (c *CMSketch) uFIDelateTopNWithDelta(h1, h2 uint64, d []byte, delta uint64) bool {
   195  	if c.topN == nil {
   196  		return false
   197  	}
   198  	spacetime := c.findTopNMeta(h1, h2, d)
   199  	if spacetime != nil {
   200  		spacetime.Count += delta
   201  		return true
   202  	}
   203  	return false
   204  }
   205  
   206  // QueryTopN returns the results for (h1, h2) in murmur3.Sum128(), if not exists, return (0, false).
   207  func (c *CMSketch) QueryTopN(h1, h2 uint64, d []byte) (uint64, bool) {
   208  	if c.topN == nil {
   209  		return 0, false
   210  	}
   211  	spacetime := c.findTopNMeta(h1, h2, d)
   212  	if spacetime != nil {
   213  		return spacetime.Count, true
   214  	}
   215  	return 0, false
   216  }
   217  
// InsertBytes inserts the bytes value into the CM Sketch with a count of 1.
// It delegates to insertBytesByCount, so a value already present in TopN is
// counted there instead of in the counter table.
func (c *CMSketch) InsertBytes(bytes []byte) {
	c.insertBytesByCount(bytes, 1)
}
   222  
   223  // insertBytesByCount adds the bytes value into the TopN (if value already in TopN) or CM Sketch by delta, this does not uFIDelates c.defaultValue.
   224  func (c *CMSketch) insertBytesByCount(bytes []byte, count uint64) {
   225  	h1, h2 := murmur3.Sum128(bytes)
   226  	if c.uFIDelateTopNWithDelta(h1, h2, bytes, count) {
   227  		return
   228  	}
   229  	c.count += count
   230  	for i := range c.causet {
   231  		j := (h1 + h2*uint64(i)) % uint64(c.width)
   232  		c.causet[i][j] += uint32(count)
   233  	}
   234  }
   235  
   236  func (c *CMSketch) considerDefVal(cnt uint64) bool {
   237  	return (cnt == 0 || (cnt > c.defaultValue && cnt < 2*(c.count/uint64(c.width)))) && c.defaultValue > 0
   238  }
   239  
   240  // uFIDelateValueBytes uFIDelates value of d to count.
   241  func (c *CMSketch) uFIDelateValueBytes(d []byte, count uint64) {
   242  	h1, h2 := murmur3.Sum128(d)
   243  	if oriCount, ok := c.QueryTopN(h1, h2, d); ok {
   244  		deltaCount := count - oriCount
   245  		c.uFIDelateTopNWithDelta(h1, h2, d, deltaCount)
   246  	}
   247  	c.setValue(h1, h2, count)
   248  }
   249  
// setValue sets the count for the value hashed into (h1, h2), and updates
// defaultValue if necessary.
//
// The current estimate is read with queryHashValue, and the difference between
// count and that estimate is applied to c.count and to the value's counter in
// every row of the counter table.
func (c *CMSketch) setValue(h1, h2 uint64, count uint64) {
	oriCount := c.queryHashValue(h1, h2)
	if c.considerDefVal(oriCount) {
		// We should update c.defaultValue if we used c.defaultValue when getting the estimate count.
		// This should make estimation better, remove this line if it does not work as expected.
		// NOTE(review): both terms below use c.defaultValue, so the expression
		// only reproduces the current value (up to float rounding). Presumably
		// the second term was meant to use count — confirm before changing.
		c.defaultValue = uint64(float64(c.defaultValue)*0.95 + float64(c.defaultValue)*0.05)
		if c.defaultValue == 0 {
			// c.defaultValue never guess 0 since we are using a sampled data.
			c.defaultValue = 1
		}
	}

	// Unsigned arithmetic: when count < oriCount the subtraction wraps, and the
	// addition wraps back, yielding the intended decrease.
	c.count += count - oriCount
	// let it overflow naturally
	deltaCount := uint32(count) - uint32(oriCount)
	for i := range c.causet {
		j := (h1 + h2*uint64(i)) % uint64(c.width)
		c.causet[i][j] = c.causet[i][j] + deltaCount
	}
}
   271  
   272  func (c *CMSketch) subValue(h1, h2 uint64, count uint64) {
   273  	c.count -= count
   274  	for i := range c.causet {
   275  		j := (h1 + h2*uint64(i)) % uint64(c.width)
   276  		c.causet[i][j] = c.causet[i][j] - uint32(count)
   277  	}
   278  }
   279  
   280  func (c *CMSketch) queryValue(sc *stmtctx.StatementContext, val types.Causet) (uint64, error) {
   281  	bytes, err := blockcodec.EncodeValue(sc, nil, val)
   282  	if err != nil {
   283  		return 0, errors.Trace(err)
   284  	}
   285  	return c.QueryBytes(bytes), nil
   286  }
   287  
   288  // QueryBytes is used to query the count of specified bytes.
   289  func (c *CMSketch) QueryBytes(d []byte) uint64 {
   290  	h1, h2 := murmur3.Sum128(d)
   291  	if count, ok := c.QueryTopN(h1, h2, d); ok {
   292  		return count
   293  	}
   294  	return c.queryHashValue(h1, h2)
   295  }
   296  
// queryHashValue estimates the count of the value hashed into (h1, h2) from the
// counter table only; TopN is not consulted here.
//
// For each row i the counter at column (h1 + h2*i) % width is read, and an
// estimated noise of (c.count - counter) / (width - 1) is subtracted from it.
// The estimate is the median of the per-row results (for an even depth, the
// lower middle value plus half the gap to the upper one), capped by the
// smallest raw counter. If considerDefVal accepts the estimate, c.defaultValue
// is returned instead.
//
// NOTE(review): the noise divisor is width - 1, so a sketch with width == 1
// would divide by zero here.
func (c *CMSketch) queryHashValue(h1, h2 uint64) uint64 {
	vals := make([]uint32, c.depth)
	min := uint32(math.MaxUint32)
	// We want that when res is 0 before the noise is eliminated, the default value is not used.
	// So we need a temp value to distinguish before and after eliminating noise.
	temp := uint32(1)
	for i := range c.causet {
		j := (h1 + h2*uint64(i)) % uint64(c.width)
		if min > c.causet[i][j] {
			min = c.causet[i][j]
		}
		noise := (c.count - uint64(c.causet[i][j])) / (uint64(c.width) - 1)
		if uint64(c.causet[i][j]) == 0 {
			// An untouched counter stays 0, below the temp offset used otherwise.
			vals[i] = 0
		} else if uint64(c.causet[i][j]) < noise {
			// The counter is entirely noise: de-noised value 0, stored as 0 + temp.
			vals[i] = temp
		} else {
			vals[i] = c.causet[i][j] - uint32(noise) + temp
		}
	}
	sort.Sort(sortutil.Uint32Slice(vals))
	res := vals[(c.depth-1)/2] + (vals[c.depth/2]-vals[(c.depth-1)/2])/2
	if res > min+temp {
		res = min + temp
	}
	// res == 0 means the median came from counters that were 0 before noise
	// removal; return 0 without considering the default value.
	if res == 0 {
		return uint64(0)
	}
	// Remove the temp offset to get the actual de-noised estimate.
	res = res - temp
	if c.considerDefVal(uint64(res)) {
		return c.defaultValue
	}
	return uint64(res)
}
   331  
   332  func (c *CMSketch) mergeTopN(lTopN map[uint64][]*TopNMeta, rTopN map[uint64][]*TopNMeta, numTop uint32, usingMax bool) {
   333  	counter := make(map[replog.MublockString]uint64)
   334  	for _, spacetimes := range lTopN {
   335  		for _, spacetime := range spacetimes {
   336  			counter[replog.String(spacetime.Data)] += spacetime.Count
   337  		}
   338  	}
   339  	for _, spacetimes := range rTopN {
   340  		for _, spacetime := range spacetimes {
   341  			if usingMax {
   342  				counter[replog.String(spacetime.Data)] = mathutil.MaxUint64(counter[replog.String(spacetime.Data)], spacetime.Count)
   343  			} else {
   344  				counter[replog.String(spacetime.Data)] += spacetime.Count
   345  			}
   346  		}
   347  	}
   348  	sorted := make([]uint64, len(counter))
   349  	for _, cnt := range counter {
   350  		sorted = append(sorted, cnt)
   351  	}
   352  	sort.Slice(sorted, func(i, j int) bool {
   353  		return sorted[i] > sorted[j]
   354  	})
   355  	numTop = mathutil.MinUint32(uint32(len(counter)), numTop)
   356  	lastTopCnt := sorted[numTop-1]
   357  	c.topN = make(map[uint64][]*TopNMeta)
   358  	for value, cnt := range counter {
   359  		data := replog.Slice(string(value))
   360  		if cnt >= lastTopCnt {
   361  			h1, h2 := murmur3.Sum128(data)
   362  			c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt})
   363  		} else {
   364  			c.insertBytesByCount(data, cnt)
   365  		}
   366  	}
   367  }
   368  
   369  // MergeCMSketch merges two CM Sketch.
   370  func (c *CMSketch) MergeCMSketch(rc *CMSketch, numTopN uint32) error {
   371  	if c == nil || rc == nil {
   372  		return nil
   373  	}
   374  	if c.depth != rc.depth || c.width != rc.width {
   375  		return errors.New("Dimensions of Count-Min Sketch should be the same")
   376  	}
   377  	if len(c.topN) > 0 || len(rc.topN) > 0 {
   378  		c.mergeTopN(c.topN, rc.topN, numTopN, false)
   379  	}
   380  	c.count += rc.count
   381  	for i := range c.causet {
   382  		for j := range c.causet[i] {
   383  			c.causet[i][j] += rc.causet[i][j]
   384  		}
   385  	}
   386  	return nil
   387  }
   388  
   389  // MergeCMSketch4IncrementalAnalyze merges two CM Sketch for incremental analyze. Since there is no value
   390  // that appears partially in `c` and `rc` for incremental analyze, it uses `max` to merge them.
   391  // Here is a simple proof: when we query from the CM sketch, we use the `min` to get the answer:
   392  //   (1): For values that only appears in `c, using `max` to merge them affects the `min` query result less than using `sum`;
   393  //   (2): For values that only appears in `rc`, it is the same as condition (1);
   394  //   (3): For values that appears both in `c` and `rc`, if they do not appear partially in `c` and `rc`, for example,
   395  //        if `v` appears 5 times in the causet, it can appears 5 times in `c` and 3 times in `rc`, then `max` also gives the correct answer.
   396  // So in fact, if we can know the number of appearances of each value in the first place, it is better to use `max` to construct the CM sketch rather than `sum`.
   397  func (c *CMSketch) MergeCMSketch4IncrementalAnalyze(rc *CMSketch, numTopN uint32) error {
   398  	if c.depth != rc.depth || c.width != rc.width {
   399  		return errors.New("Dimensions of Count-Min Sketch should be the same")
   400  	}
   401  	if len(c.topN) > 0 || len(rc.topN) > 0 {
   402  		c.mergeTopN(c.topN, rc.topN, numTopN, true)
   403  	}
   404  	for i := range c.causet {
   405  		c.count = 0
   406  		for j := range c.causet[i] {
   407  			c.causet[i][j] = mathutil.MaxUint32(c.causet[i][j], rc.causet[i][j])
   408  			c.count += uint64(c.causet[i][j])
   409  		}
   410  	}
   411  	return nil
   412  }
   413  
   414  // CMSketchToProto converts CMSketch to its protobuf representation.
   415  func CMSketchToProto(c *CMSketch) *fidelpb.CMSketch {
   416  	protoSketch := &fidelpb.CMSketch{Rows: make([]*fidelpb.CMSketchRow, c.depth)}
   417  	for i := range c.causet {
   418  		protoSketch.Rows[i] = &fidelpb.CMSketchRow{Counters: make([]uint32, c.width)}
   419  		for j := range c.causet[i] {
   420  			protoSketch.Rows[i].Counters[j] = c.causet[i][j]
   421  		}
   422  	}
   423  	for _, dataSlice := range c.topN {
   424  		for _, dataMeta := range dataSlice {
   425  			protoSketch.TopN = append(protoSketch.TopN, &fidelpb.CMSketchTopN{Data: dataMeta.Data, Count: dataMeta.Count})
   426  		}
   427  	}
   428  	protoSketch.DefaultValue = c.defaultValue
   429  	return protoSketch
   430  }
   431  
   432  // CMSketchFromProto converts CMSketch from its protobuf representation.
   433  func CMSketchFromProto(protoSketch *fidelpb.CMSketch) *CMSketch {
   434  	if protoSketch == nil || len(protoSketch.Rows) == 0 {
   435  		return nil
   436  	}
   437  	c := NewCMSketch(int32(len(protoSketch.Rows)), int32(len(protoSketch.Rows[0].Counters)))
   438  	for i, event := range protoSketch.Rows {
   439  		c.count = 0
   440  		for j, counter := range event.Counters {
   441  			c.causet[i][j] = counter
   442  			c.count = c.count + uint64(counter)
   443  		}
   444  	}
   445  	c.defaultValue = protoSketch.DefaultValue
   446  	if len(protoSketch.TopN) == 0 {
   447  		return c
   448  	}
   449  	c.topN = make(map[uint64][]*TopNMeta, len(protoSketch.TopN))
   450  	for _, e := range protoSketch.TopN {
   451  		h1, h2 := murmur3.Sum128(e.Data)
   452  		c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, e.Data, e.Count})
   453  	}
   454  	return c
   455  }
   456  
   457  // EncodeCMSketchWithoutTopN encodes the given CMSketch to byte slice.
   458  // Note that it does not include the topN.
   459  func EncodeCMSketchWithoutTopN(c *CMSketch) ([]byte, error) {
   460  	if c == nil {
   461  		return nil, nil
   462  	}
   463  	p := CMSketchToProto(c)
   464  	p.TopN = nil
   465  	protoData, err := p.Marshal()
   466  	return protoData, err
   467  }
   468  
   469  // DecodeCMSketch decode a CMSketch from the given byte slice.
   470  func DecodeCMSketch(data []byte, topNRows []chunk.Row) (*CMSketch, error) {
   471  	if data == nil {
   472  		return nil, nil
   473  	}
   474  	p := &fidelpb.CMSketch{}
   475  	err := p.Unmarshal(data)
   476  	if err != nil {
   477  		return nil, errors.Trace(err)
   478  	}
   479  	for _, event := range topNRows {
   480  		data := make([]byte, len(event.GetBytes(0)))
   481  		copy(data, event.GetBytes(0))
   482  		p.TopN = append(p.TopN, &fidelpb.CMSketchTopN{Data: data, Count: event.GetUint64(1)})
   483  	}
   484  	return CMSketchFromProto(p), nil
   485  }
   486  
   487  // TotalCount returns the total count in the sketch, it is only used for test.
   488  func (c *CMSketch) TotalCount() uint64 {
   489  	res := c.count
   490  	for _, spacetimes := range c.topN {
   491  		for _, spacetime := range spacetimes {
   492  			res += spacetime.Count
   493  		}
   494  	}
   495  	return res
   496  }
   497  
// Equal tests if two CM Sketch are equal, it is only used for test.
// The comparison is reflect.DeepEqual over the two pointers, so every field,
// including the counter table and the TopN map, takes part in it.
func (c *CMSketch) Equal(rc *CMSketch) bool {
	return reflect.DeepEqual(c, rc)
}
   502  
   503  // Copy makes a copy for current CMSketch.
   504  func (c *CMSketch) Copy() *CMSketch {
   505  	if c == nil {
   506  		return nil
   507  	}
   508  	tbl := make([][]uint32, c.depth)
   509  	for i := range tbl {
   510  		tbl[i] = make([]uint32, c.width)
   511  		copy(tbl[i], c.causet[i])
   512  	}
   513  	var topN map[uint64][]*TopNMeta
   514  	if c.topN != nil {
   515  		topN = make(map[uint64][]*TopNMeta, len(c.topN))
   516  		for h1, vals := range c.topN {
   517  			newVals := make([]*TopNMeta, 0, len(vals))
   518  			for _, val := range vals {
   519  				newVal := TopNMeta{h2: val.h2, Count: val.Count, Data: make([]byte, len(val.Data))}
   520  				copy(newVal.Data, val.Data)
   521  				newVals = append(newVals, &newVal)
   522  			}
   523  			topN[h1] = newVals
   524  		}
   525  	}
   526  	return &CMSketch{count: c.count, width: c.width, depth: c.depth, causet: tbl, defaultValue: c.defaultValue, topN: topN}
   527  }
   528  
   529  // TopN gets all the topN spacetime.
   530  func (c *CMSketch) TopN() []*TopNMeta {
   531  	if c == nil {
   532  		return nil
   533  	}
   534  	topN := make([]*TopNMeta, 0, len(c.topN))
   535  	for _, spacetime := range c.topN {
   536  		topN = append(topN, spacetime...)
   537  	}
   538  	return topN
   539  }
   540  
// TopNMap gets the origin topN map, keyed by h1. The map is returned as is,
// not copied, so changes made through it are visible to the sketch.
func (c *CMSketch) TopNMap() map[uint64][]*TopNMeta {
	return c.topN
}
   545  
   546  // AppendTopN appends a topn into the cm sketch.
   547  func (c *CMSketch) AppendTopN(data []byte, count uint64) {
   548  	if c.topN == nil {
   549  		c.topN = make(map[uint64][]*TopNMeta)
   550  	}
   551  	h1, h2 := murmur3.Sum128(data)
   552  	c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, count})
   553  }
   554  
// GetWidthAndDepth returns the width and depth of CM Sketch, in that order.
func (c *CMSketch) GetWidthAndDepth() (int32, int32) {
	return c.width, c.depth
}
   559  
   560  // CalcDefaultValForAnalyze calculate the default value for Analyze.
   561  // The value of it is count / NDV in CMSketch. This means count and NDV are not include topN.
   562  func (c *CMSketch) CalcDefaultValForAnalyze(NDV uint64) {
   563  	// If NDV <= TopN, all values should be in TopN.
   564  	// So we set c.defaultValue to 0 and return immediately.
   565  	if NDV <= uint64(len(c.topN)) {
   566  		c.defaultValue = 0
   567  		return
   568  	}
   569  	remainNDV := NDV - uint64(len(c.topN))
   570  	c.defaultValue = c.count / mathutil.MaxUint64(1, remainNDV)
   571  }