github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/cmsketch_test.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"fmt"
    18  	"math"
    19  	"math/rand"
    20  	"time"
    21  
    22  	. "github.com/whtcorpsinc/check"
    23  	"github.com/whtcorpsinc/errors"
    24  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    25  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    26  	"github.com/whtcorpsinc/milevadb/types"
    27  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    28  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    29  	"github.com/twmb/murmur3"
    30  )
    31  
    32  func (c *CMSketch) insert(val *types.Causet) error {
    33  	bytes, err := codec.EncodeValue(nil, nil, *val)
    34  	if err != nil {
    35  		return errors.Trace(err)
    36  	}
    37  	c.InsertBytes(bytes)
    38  	return nil
    39  }
    40  
    41  func prepareCMSWithTopN(d, w int32, vals []*types.Causet, n uint32, total uint64) (*CMSketch, error) {
    42  	data := make([][]byte, 0, len(vals))
    43  	for _, v := range vals {
    44  		bytes, err := codec.EncodeValue(nil, nil, *v)
    45  		if err != nil {
    46  			return nil, errors.Trace(err)
    47  		}
    48  		data = append(data, bytes)
    49  	}
    50  	cms, _, _ := NewCMSketchWithTopN(d, w, data, n, total)
    51  	return cms, nil
    52  }
    53  
    54  // buildCMSketchAndMapWithOffset builds cm sketch using zipf and the generated values starts from `offset`.
    55  func buildCMSketchAndMapWithOffset(d, w int32, seed int64, total, imax uint64, s float64, offset int64) (*CMSketch, map[int64]uint32, error) {
    56  	cms := NewCMSketch(d, w)
    57  	mp := make(map[int64]uint32)
    58  	zipf := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax)
    59  	for i := uint64(0); i < total; i++ {
    60  		val := types.NewIntCauset(int64(zipf.Uint64()) + offset)
    61  		err := cms.insert(&val)
    62  		if err != nil {
    63  			return nil, nil, errors.Trace(err)
    64  		}
    65  		mp[val.GetInt64()]++
    66  	}
    67  	return cms, mp, nil
    68  }
    69  
    70  func buildCMSketchAndMap(d, w int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) {
    71  	return buildCMSketchAndMapWithOffset(d, w, seed, total, imax, s, 0)
    72  }
    73  
    74  func buildCMSketchTopNAndMap(d, w, n, sample int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) {
    75  	mp := make(map[int64]uint32)
    76  	zipf := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax)
    77  	vals := make([]*types.Causet, 0)
    78  	for i := uint64(0); i < total; i++ {
    79  		val := types.NewIntCauset(int64(zipf.Uint64()))
    80  		mp[val.GetInt64()]++
    81  		if i < uint64(sample) {
    82  			vals = append(vals, &val)
    83  		}
    84  	}
    85  	cms, err := prepareCMSWithTopN(d, w, vals, uint32(n), total)
    86  	return cms, mp, err
    87  }
    88  
    89  func averageAbsoluteError(cms *CMSketch, mp map[int64]uint32) (uint64, error) {
    90  	sc := &stmtctx.StatementContext{TimeZone: time.Local}
    91  	var total uint64
    92  	for num, count := range mp {
    93  		estimate, err := cms.queryValue(sc, types.NewIntCauset(num))
    94  		if err != nil {
    95  			return 0, errors.Trace(err)
    96  		}
    97  		var diff uint64
    98  		if uint64(count) > estimate {
    99  			diff = uint64(count) - estimate
   100  		} else {
   101  			diff = estimate - uint64(count)
   102  		}
   103  		total += diff
   104  	}
   105  	return total / uint64(len(mp)), nil
   106  }
   107  
   108  func (s *testStatisticsSuite) TestCMSketch(c *C) {
   109  	tests := []struct {
   110  		zipfFactor float64
   111  		avgError   uint64
   112  	}{
   113  		{
   114  			zipfFactor: 1.1,
   115  			avgError:   3,
   116  		},
   117  		{
   118  			zipfFactor: 2,
   119  			avgError:   24,
   120  		},
   121  		{
   122  			zipfFactor: 3,
   123  			avgError:   63,
   124  		},
   125  	}
   126  	d, w := int32(5), int32(2048)
   127  	total, imax := uint64(100000), uint64(1000000)
   128  	for _, t := range tests {
   129  		lSketch, lMap, err := buildCMSketchAndMap(d, w, 0, total, imax, t.zipfFactor)
   130  		c.Check(err, IsNil)
   131  		avg, err := averageAbsoluteError(lSketch, lMap)
   132  		c.Assert(err, IsNil)
   133  		c.Check(avg, LessEqual, t.avgError)
   134  
   135  		rSketch, rMap, err := buildCMSketchAndMap(d, w, 1, total, imax, t.zipfFactor)
   136  		c.Check(err, IsNil)
   137  		avg, err = averageAbsoluteError(rSketch, rMap)
   138  		c.Assert(err, IsNil)
   139  		c.Check(avg, LessEqual, t.avgError)
   140  
   141  		err = lSketch.MergeCMSketch(rSketch, 0)
   142  		c.Assert(err, IsNil)
   143  		for val, count := range rMap {
   144  			lMap[val] += count
   145  		}
   146  		avg, err = averageAbsoluteError(lSketch, lMap)
   147  		c.Assert(err, IsNil)
   148  		c.Check(avg, Less, t.avgError*2)
   149  	}
   150  }
   151  
   152  func (s *testStatisticsSuite) TestCMSketchCoding(c *C) {
   153  	lSketch := NewCMSketch(5, 2048)
   154  	lSketch.count = 2048 * math.MaxUint32
   155  	for i := range lSketch.causet {
   156  		for j := range lSketch.causet[i] {
   157  			lSketch.causet[i][j] = math.MaxUint32
   158  		}
   159  	}
   160  	bytes, err := EncodeCMSketchWithoutTopN(lSketch)
   161  	c.Assert(err, IsNil)
   162  	c.Assert(len(bytes), Equals, 61457)
   163  	rSketch, err := DecodeCMSketch(bytes, nil)
   164  	c.Assert(err, IsNil)
   165  	c.Assert(lSketch.Equal(rSketch), IsTrue)
   166  }
   167  
   168  func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
   169  	tests := []struct {
   170  		zipfFactor float64
   171  		avgError   uint64
   172  	}{
   173  		// If no significant most items, TopN may will produce results worse than normal algorithm.
   174  		// The first two tests produces almost same avg.
   175  		{
   176  			zipfFactor: 1.0000001,
   177  			avgError:   30,
   178  		},
   179  		{
   180  			zipfFactor: 1.1,
   181  			avgError:   30,
   182  		},
   183  		{
   184  			zipfFactor: 2,
   185  			avgError:   89,
   186  		},
   187  		// If the most data lies in a narrow range, our guess may have better result.
   188  		// The error mainly comes from huge numbers.
   189  		{
   190  			zipfFactor: 5,
   191  			avgError:   208,
   192  		},
   193  	}
   194  	d, w := int32(5), int32(2048)
   195  	total, imax := uint64(1000000), uint64(1000000)
   196  	for _, t := range tests {
   197  		lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor)
   198  		c.Check(err, IsNil)
   199  		c.Assert(len(lSketch.TopN()), LessEqual, 40)
   200  		avg, err := averageAbsoluteError(lSketch, lMap)
   201  		c.Assert(err, IsNil)
   202  		c.Check(avg, LessEqual, t.avgError)
   203  	}
   204  }
   205  
   206  func (s *testStatisticsSuite) TestMergeCMSketch4IncrementalAnalyze(c *C) {
   207  	tests := []struct {
   208  		zipfFactor float64
   209  		avgError   uint64
   210  	}{
   211  		{
   212  			zipfFactor: 1.0000001,
   213  			avgError:   48,
   214  		},
   215  		{
   216  			zipfFactor: 1.1,
   217  			avgError:   48,
   218  		},
   219  		{
   220  			zipfFactor: 2,
   221  			avgError:   128,
   222  		},
   223  		{
   224  			zipfFactor: 5,
   225  			avgError:   256,
   226  		},
   227  	}
   228  	d, w := int32(5), int32(2048)
   229  	total, imax := uint64(100000), uint64(1000000)
   230  	for _, t := range tests {
   231  		lSketch, lMap, err := buildCMSketchAndMap(d, w, 0, total, imax, t.zipfFactor)
   232  		c.Check(err, IsNil)
   233  		avg, err := averageAbsoluteError(lSketch, lMap)
   234  		c.Assert(err, IsNil)
   235  		c.Check(avg, LessEqual, t.avgError)
   236  
   237  		rSketch, rMap, err := buildCMSketchAndMapWithOffset(d, w, 1, total, imax, t.zipfFactor, int64(imax))
   238  		c.Check(err, IsNil)
   239  		avg, err = averageAbsoluteError(rSketch, rMap)
   240  		c.Assert(err, IsNil)
   241  		c.Check(avg, LessEqual, t.avgError)
   242  
   243  		for key, val := range rMap {
   244  			lMap[key] += val
   245  		}
   246  		c.Assert(lSketch.MergeCMSketch4IncrementalAnalyze(rSketch, 0), IsNil)
   247  		avg, err = averageAbsoluteError(lSketch, lMap)
   248  		c.Assert(err, IsNil)
   249  		c.Check(avg, LessEqual, t.avgError)
   250  		width, depth := lSketch.GetWidthAndDepth()
   251  		c.Assert(width, Equals, int32(2048))
   252  		c.Assert(depth, Equals, int32(5))
   253  	}
   254  }
   255  
   256  func (s *testStatisticsSuite) TestCMSketchTopNUniqueData(c *C) {
   257  	d, w := int32(5), int32(2048)
   258  	total := uint64(1000000)
   259  	mp := make(map[int64]uint32)
   260  	vals := make([]*types.Causet, 0)
   261  	for i := uint64(0); i < total; i++ {
   262  		val := types.NewIntCauset(int64(i))
   263  		mp[val.GetInt64()]++
   264  		if i < uint64(1000) {
   265  			vals = append(vals, &val)
   266  		}
   267  	}
   268  	cms, err := prepareCMSWithTopN(d, w, vals, uint32(20), total)
   269  	c.Assert(err, IsNil)
   270  	avg, err := averageAbsoluteError(cms, mp)
   271  	c.Assert(err, IsNil)
   272  	c.Check(cms.defaultValue, Equals, uint64(1))
   273  	c.Check(avg, Equals, uint64(0))
   274  	c.Check(len(cms.topN), Equals, 0)
   275  }
   276  
   277  func (s *testStatisticsSuite) TestCMSketchCodingTopN(c *C) {
   278  	lSketch := NewCMSketch(5, 2048)
   279  	lSketch.count = 2048 * (math.MaxUint32)
   280  	for i := range lSketch.causet {
   281  		for j := range lSketch.causet[i] {
   282  			lSketch.causet[i][j] = math.MaxUint32
   283  		}
   284  	}
   285  	lSketch.topN = make(map[uint64][]*TopNMeta)
   286  	unsignedLong := types.NewFieldType(allegrosql.TypeLonglong)
   287  	unsignedLong.Flag |= allegrosql.UnsignedFlag
   288  	chk := chunk.New([]*types.FieldType{types.NewFieldType(allegrosql.TypeBlob), unsignedLong}, 20, 20)
   289  	var rows []chunk.Row
   290  	for i := 0; i < 20; i++ {
   291  		tString := []byte(fmt.Sprintf("%20000d", i))
   292  		h1, h2 := murmur3.Sum128(tString)
   293  		lSketch.topN[h1] = []*TopNMeta{{h2, tString, math.MaxUint64}}
   294  		chk.AppendBytes(0, tString)
   295  		chk.AppendUint64(1, math.MaxUint64)
   296  		rows = append(rows, chk.GetRow(i))
   297  	}
   298  
   299  	bytes, err := EncodeCMSketchWithoutTopN(lSketch)
   300  	c.Assert(err, IsNil)
   301  	c.Assert(len(bytes), Equals, 61457)
   302  	rSketch, err := DecodeCMSketch(bytes, rows)
   303  	c.Assert(err, IsNil)
   304  	c.Assert(lSketch.Equal(rSketch), IsTrue)
   305  	// do not panic
   306  	DecodeCMSketch([]byte{}, rows)
   307  }