github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/cmsketch_test.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statistics 15 16 import ( 17 "fmt" 18 "math" 19 "math/rand" 20 "time" 21 22 . "github.com/whtcorpsinc/check" 23 "github.com/whtcorpsinc/errors" 24 "github.com/whtcorpsinc/BerolinaSQL/allegrosql" 25 "github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx" 26 "github.com/whtcorpsinc/milevadb/types" 27 "github.com/whtcorpsinc/milevadb/soliton/chunk" 28 "github.com/whtcorpsinc/milevadb/soliton/codec" 29 "github.com/twmb/murmur3" 30 ) 31 32 func (c *CMSketch) insert(val *types.Causet) error { 33 bytes, err := codec.EncodeValue(nil, nil, *val) 34 if err != nil { 35 return errors.Trace(err) 36 } 37 c.InsertBytes(bytes) 38 return nil 39 } 40 41 func prepareCMSWithTopN(d, w int32, vals []*types.Causet, n uint32, total uint64) (*CMSketch, error) { 42 data := make([][]byte, 0, len(vals)) 43 for _, v := range vals { 44 bytes, err := codec.EncodeValue(nil, nil, *v) 45 if err != nil { 46 return nil, errors.Trace(err) 47 } 48 data = append(data, bytes) 49 } 50 cms, _, _ := NewCMSketchWithTopN(d, w, data, n, total) 51 return cms, nil 52 } 53 54 // buildCMSketchAndMapWithOffset builds cm sketch using zipf and the generated values starts from `offset`. 55 func buildCMSketchAndMapWithOffset(d, w int32, seed int64, total, imax uint64, s float64, offset int64) (*CMSketch, map[int64]uint32, error) { 56 cms := NewCMSketch(d, w) 57 mp := make(map[int64]uint32) 58 zipf := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax) 59 for i := uint64(0); i < total; i++ { 60 val := types.NewIntCauset(int64(zipf.Uint64()) + offset) 61 err := cms.insert(&val) 62 if err != nil { 63 return nil, nil, errors.Trace(err) 64 } 65 mp[val.GetInt64()]++ 66 } 67 return cms, mp, nil 68 } 69 70 func buildCMSketchAndMap(d, w int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) { 71 return buildCMSketchAndMapWithOffset(d, w, seed, total, imax, s, 0) 72 } 73 74 func buildCMSketchTopNAndMap(d, w, n, sample int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) { 75 mp := make(map[int64]uint32) 76 zipf := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax) 77 vals := make([]*types.Causet, 0) 78 for i := uint64(0); i < total; i++ { 79 val := types.NewIntCauset(int64(zipf.Uint64())) 80 mp[val.GetInt64()]++ 81 if i < uint64(sample) { 82 vals = append(vals, &val) 83 } 84 } 85 cms, err := prepareCMSWithTopN(d, w, vals, uint32(n), total) 86 return cms, mp, err 87 } 88 89 func averageAbsoluteError(cms *CMSketch, mp map[int64]uint32) (uint64, error) { 90 sc := &stmtctx.StatementContext{TimeZone: time.Local} 91 var total uint64 92 for num, count := range mp { 93 estimate, err := cms.queryValue(sc, types.NewIntCauset(num)) 94 if err != nil { 95 return 0, errors.Trace(err) 96 } 97 var diff uint64 98 if uint64(count) > estimate { 99 diff = uint64(count) - estimate 100 } else { 101 diff = estimate - uint64(count) 102 } 103 total += diff 104 } 105 return total / uint64(len(mp)), nil 106 } 107 108 func (s *testStatisticsSuite) TestCMSketch(c *C) { 109 tests := []struct { 110 zipfFactor float64 111 avgError uint64 112 }{ 113 { 114 zipfFactor: 1.1, 115 avgError: 3, 116 }, 117 { 118 zipfFactor: 2, 119 avgError: 24, 120 }, 121 { 122 zipfFactor: 3, 123 avgError: 63, 124 }, 125 } 126 d, w := int32(5), int32(2048) 127 total, imax := uint64(100000), uint64(1000000) 128 for _, t := range tests { 129 lSketch, lMap, err := buildCMSketchAndMap(d, w, 0, total, imax, t.zipfFactor) 130 c.Check(err, IsNil) 131 avg, err := averageAbsoluteError(lSketch, lMap) 132 c.Assert(err, IsNil) 133 c.Check(avg, LessEqual, t.avgError) 134 135 rSketch, rMap, err := buildCMSketchAndMap(d, w, 1, total, imax, t.zipfFactor) 136 c.Check(err, IsNil) 137 avg, err = averageAbsoluteError(rSketch, rMap) 138 c.Assert(err, IsNil) 139 c.Check(avg, LessEqual, t.avgError) 140 141 err = lSketch.MergeCMSketch(rSketch, 0) 142 c.Assert(err, IsNil) 143 for val, count := range rMap { 144 lMap[val] += count 145 } 146 avg, err = averageAbsoluteError(lSketch, lMap) 147 c.Assert(err, IsNil) 148 c.Check(avg, Less, t.avgError*2) 149 } 150 } 151 152 func (s *testStatisticsSuite) TestCMSketchCoding(c *C) { 153 lSketch := NewCMSketch(5, 2048) 154 lSketch.count = 2048 * math.MaxUint32 155 for i := range lSketch.causet { 156 for j := range lSketch.causet[i] { 157 lSketch.causet[i][j] = math.MaxUint32 158 } 159 } 160 bytes, err := EncodeCMSketchWithoutTopN(lSketch) 161 c.Assert(err, IsNil) 162 c.Assert(len(bytes), Equals, 61457) 163 rSketch, err := DecodeCMSketch(bytes, nil) 164 c.Assert(err, IsNil) 165 c.Assert(lSketch.Equal(rSketch), IsTrue) 166 } 167 168 func (s *testStatisticsSuite) TestCMSketchTopN(c *C) { 169 tests := []struct { 170 zipfFactor float64 171 avgError uint64 172 }{ 173 // If no significant most items, TopN may will produce results worse than normal algorithm. 174 // The first two tests produces almost same avg. 175 { 176 zipfFactor: 1.0000001, 177 avgError: 30, 178 }, 179 { 180 zipfFactor: 1.1, 181 avgError: 30, 182 }, 183 { 184 zipfFactor: 2, 185 avgError: 89, 186 }, 187 // If the most data lies in a narrow range, our guess may have better result. 188 // The error mainly comes from huge numbers. 189 { 190 zipfFactor: 5, 191 avgError: 208, 192 }, 193 } 194 d, w := int32(5), int32(2048) 195 total, imax := uint64(1000000), uint64(1000000) 196 for _, t := range tests { 197 lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor) 198 c.Check(err, IsNil) 199 c.Assert(len(lSketch.TopN()), LessEqual, 40) 200 avg, err := averageAbsoluteError(lSketch, lMap) 201 c.Assert(err, IsNil) 202 c.Check(avg, LessEqual, t.avgError) 203 } 204 } 205 206 func (s *testStatisticsSuite) TestMergeCMSketch4IncrementalAnalyze(c *C) { 207 tests := []struct { 208 zipfFactor float64 209 avgError uint64 210 }{ 211 { 212 zipfFactor: 1.0000001, 213 avgError: 48, 214 }, 215 { 216 zipfFactor: 1.1, 217 avgError: 48, 218 }, 219 { 220 zipfFactor: 2, 221 avgError: 128, 222 }, 223 { 224 zipfFactor: 5, 225 avgError: 256, 226 }, 227 } 228 d, w := int32(5), int32(2048) 229 total, imax := uint64(100000), uint64(1000000) 230 for _, t := range tests { 231 lSketch, lMap, err := buildCMSketchAndMap(d, w, 0, total, imax, t.zipfFactor) 232 c.Check(err, IsNil) 233 avg, err := averageAbsoluteError(lSketch, lMap) 234 c.Assert(err, IsNil) 235 c.Check(avg, LessEqual, t.avgError) 236 237 rSketch, rMap, err := buildCMSketchAndMapWithOffset(d, w, 1, total, imax, t.zipfFactor, int64(imax)) 238 c.Check(err, IsNil) 239 avg, err = averageAbsoluteError(rSketch, rMap) 240 c.Assert(err, IsNil) 241 c.Check(avg, LessEqual, t.avgError) 242 243 for key, val := range rMap { 244 lMap[key] += val 245 } 246 c.Assert(lSketch.MergeCMSketch4IncrementalAnalyze(rSketch, 0), IsNil) 247 avg, err = averageAbsoluteError(lSketch, lMap) 248 c.Assert(err, IsNil) 249 c.Check(avg, LessEqual, t.avgError) 250 width, depth := lSketch.GetWidthAndDepth() 251 c.Assert(width, Equals, int32(2048)) 252 c.Assert(depth, Equals, int32(5)) 253 } 254 } 255 256 func (s *testStatisticsSuite) TestCMSketchTopNUniqueData(c *C) { 257 d, w := int32(5), int32(2048) 258 total := uint64(1000000) 259 mp := make(map[int64]uint32) 260 vals := make([]*types.Causet, 0) 261 for i := uint64(0); i < total; i++ { 262 val := types.NewIntCauset(int64(i)) 263 mp[val.GetInt64()]++ 264 if i < uint64(1000) { 265 vals = append(vals, &val) 266 } 267 } 268 cms, err := prepareCMSWithTopN(d, w, vals, uint32(20), total) 269 c.Assert(err, IsNil) 270 avg, err := averageAbsoluteError(cms, mp) 271 c.Assert(err, IsNil) 272 c.Check(cms.defaultValue, Equals, uint64(1)) 273 c.Check(avg, Equals, uint64(0)) 274 c.Check(len(cms.topN), Equals, 0) 275 } 276 277 func (s *testStatisticsSuite) TestCMSketchCodingTopN(c *C) { 278 lSketch := NewCMSketch(5, 2048) 279 lSketch.count = 2048 * (math.MaxUint32) 280 for i := range lSketch.causet { 281 for j := range lSketch.causet[i] { 282 lSketch.causet[i][j] = math.MaxUint32 283 } 284 } 285 lSketch.topN = make(map[uint64][]*TopNMeta) 286 unsignedLong := types.NewFieldType(allegrosql.TypeLonglong) 287 unsignedLong.Flag |= allegrosql.UnsignedFlag 288 chk := chunk.New([]*types.FieldType{types.NewFieldType(allegrosql.TypeBlob), unsignedLong}, 20, 20) 289 var rows []chunk.Row 290 for i := 0; i < 20; i++ { 291 tString := []byte(fmt.Sprintf("%20000d", i)) 292 h1, h2 := murmur3.Sum128(tString) 293 lSketch.topN[h1] = []*TopNMeta{{h2, tString, math.MaxUint64}} 294 chk.AppendBytes(0, tString) 295 chk.AppendUint64(1, math.MaxUint64) 296 rows = append(rows, chk.GetRow(i)) 297 } 298 299 bytes, err := EncodeCMSketchWithoutTopN(lSketch) 300 c.Assert(err, IsNil) 301 c.Assert(len(bytes), Equals, 61457) 302 rSketch, err := DecodeCMSketch(bytes, rows) 303 c.Assert(err, IsNil) 304 c.Assert(lSketch.Equal(rSketch), IsTrue) 305 // do not panic 306 DecodeCMSketch([]byte{}, rows) 307 }