github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/sample.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statistics 15 16 import ( 17 "context" 18 "sort" 19 "time" 20 21 "github.com/twmb/murmur3" 22 "github.com/whtcorpsinc/BerolinaSQL/allegrosql" 23 "github.com/whtcorpsinc/BerolinaSQL/ast" 24 "github.com/whtcorpsinc/BerolinaSQL/terror" 25 "github.com/whtcorpsinc/errors" 26 "github.com/whtcorpsinc/fidelpb/go-fidelpb" 27 "github.com/whtcorpsinc/milevadb/blockcodec" 28 "github.com/whtcorpsinc/milevadb/ekv" 29 "github.com/whtcorpsinc/milevadb/soliton/chunk" 30 "github.com/whtcorpsinc/milevadb/soliton/collate" 31 "github.com/whtcorpsinc/milevadb/soliton/fastrand" 32 "github.com/whtcorpsinc/milevadb/soliton/sqlexec" 33 "github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx" 34 "github.com/whtcorpsinc/milevadb/types" 35 ) 36 37 // SampleItem is an item of sampled column value. 38 type SampleItem struct { 39 // Value is the sampled column value. 40 Value types.Causet 41 // Ordinal is original position of this item in SampleDefCauslector before sorting. This 42 // is used for computing correlation. 43 Ordinal int 44 // Handle is the handle of the sample in its key. 45 // This property is used to calculate Ordinal in fast analyze. 46 Handle ekv.Handle 47 } 48 49 // CopySampleItems returns a deep copy of SampleItem slice. 50 func CopySampleItems(items []*SampleItem) []*SampleItem { 51 n := make([]*SampleItem, len(items)) 52 for i, item := range items { 53 ni := *item 54 n[i] = &ni 55 } 56 return n 57 } 58 59 // SortSampleItems shallow copies and sorts a slice of SampleItem. 60 func SortSampleItems(sc *stmtctx.StatementContext, items []*SampleItem) ([]*SampleItem, error) { 61 sortedItems := make([]*SampleItem, len(items)) 62 copy(sortedItems, items) 63 sorter := sampleItemSorter{items: sortedItems, sc: sc} 64 sort.Sblock(&sorter) 65 return sortedItems, sorter.err 66 } 67 68 type sampleItemSorter struct { 69 items []*SampleItem 70 sc *stmtctx.StatementContext 71 err error 72 } 73 74 func (s *sampleItemSorter) Len() int { 75 return len(s.items) 76 } 77 78 func (s *sampleItemSorter) Less(i, j int) bool { 79 var cmp int 80 cmp, s.err = s.items[i].Value.CompareCauset(s.sc, &s.items[j].Value) 81 if s.err != nil { 82 return true 83 } 84 return cmp < 0 85 } 86 87 func (s *sampleItemSorter) Swap(i, j int) { 88 s.items[i], s.items[j] = s.items[j], s.items[i] 89 } 90 91 // SampleDefCauslector will collect Samples and calculate the count and ndv of an attribute. 92 type SampleDefCauslector struct { 93 Samples []*SampleItem 94 seenValues int64 // seenValues is the current seen values. 95 IsMerger bool 96 NullCount int64 97 Count int64 // Count is the number of non-null rows. 98 MaxSampleSize int64 99 FMSketch *FMSketch 100 CMSketch *CMSketch 101 TotalSize int64 // TotalSize is the total size of column. 102 } 103 104 // MergeSampleDefCauslector merges two sample collectors. 105 func (c *SampleDefCauslector) MergeSampleDefCauslector(sc *stmtctx.StatementContext, rc *SampleDefCauslector) { 106 c.NullCount += rc.NullCount 107 c.Count += rc.Count 108 c.TotalSize += rc.TotalSize 109 c.FMSketch.mergeFMSketch(rc.FMSketch) 110 if rc.CMSketch != nil { 111 err := c.CMSketch.MergeCMSketch(rc.CMSketch, 0) 112 terror.Log(errors.Trace(err)) 113 } 114 for _, item := range rc.Samples { 115 err := c.collect(sc, item.Value) 116 terror.Log(errors.Trace(err)) 117 } 118 } 119 120 // SampleDefCauslectorToProto converts SampleDefCauslector to its protobuf representation. 121 func SampleDefCauslectorToProto(c *SampleDefCauslector) *fidelpb.SampleDefCauslector { 122 collector := &fidelpb.SampleDefCauslector{ 123 NullCount: c.NullCount, 124 Count: c.Count, 125 FmSketch: FMSketchToProto(c.FMSketch), 126 TotalSize: &c.TotalSize, 127 } 128 if c.CMSketch != nil { 129 collector.CmSketch = CMSketchToProto(c.CMSketch) 130 } 131 for _, item := range c.Samples { 132 collector.Samples = append(collector.Samples, item.Value.GetBytes()) 133 } 134 return collector 135 } 136 137 const maxSampleValueLength = allegrosql.MaxFieldVarCharLength / 2 138 139 // SampleDefCauslectorFromProto converts SampleDefCauslector from its protobuf representation. 140 func SampleDefCauslectorFromProto(collector *fidelpb.SampleDefCauslector) *SampleDefCauslector { 141 s := &SampleDefCauslector{ 142 NullCount: collector.NullCount, 143 Count: collector.Count, 144 FMSketch: FMSketchFromProto(collector.FmSketch), 145 } 146 if collector.TotalSize != nil { 147 s.TotalSize = *collector.TotalSize 148 } 149 s.CMSketch = CMSketchFromProto(collector.CmSketch) 150 for _, val := range collector.Samples { 151 // When causetstore the histogram bucket boundaries to ekv, we need to limit the length of the value. 152 if len(val) <= maxSampleValueLength { 153 item := &SampleItem{Value: types.NewBytesCauset(val)} 154 s.Samples = append(s.Samples, item) 155 } 156 } 157 return s 158 } 159 160 func (c *SampleDefCauslector) collect(sc *stmtctx.StatementContext, d types.Causet) error { 161 if !c.IsMerger { 162 if d.IsNull() { 163 c.NullCount++ 164 return nil 165 } 166 c.Count++ 167 if err := c.FMSketch.InsertValue(sc, d); err != nil { 168 return errors.Trace(err) 169 } 170 if c.CMSketch != nil { 171 c.CMSketch.InsertBytes(d.GetBytes()) 172 } 173 // Minus one is to remove the flag byte. 174 c.TotalSize += int64(len(d.GetBytes()) - 1) 175 } 176 c.seenValues++ 177 // The following code use types.CloneCauset(d) because d may have a deep reference 178 // to the underlying slice, GC can't free them which lead to memory leak eventually. 179 // TODO: Refactor the proto to avoid copying here. 180 if len(c.Samples) < int(c.MaxSampleSize) { 181 newItem := &SampleItem{} 182 d.Copy(&newItem.Value) 183 c.Samples = append(c.Samples, newItem) 184 } else { 185 shouldAdd := int64(fastrand.Uint64N(uint64(c.seenValues))) < c.MaxSampleSize 186 if shouldAdd { 187 idx := int(fastrand.Uint32N(uint32(c.MaxSampleSize))) 188 newItem := &SampleItem{} 189 d.Copy(&newItem.Value) 190 // To keep the order of the elements, we use delete and append, not direct rememristed. 191 c.Samples = append(c.Samples[:idx], c.Samples[idx+1:]...) 192 c.Samples = append(c.Samples, newItem) 193 } 194 } 195 return nil 196 } 197 198 // CalcTotalSize is to calculate total size based on samples. 199 func (c *SampleDefCauslector) CalcTotalSize() { 200 c.TotalSize = 0 201 for _, item := range c.Samples { 202 c.TotalSize += int64(len(item.Value.GetBytes())) 203 } 204 } 205 206 // SampleBuilder is used to build samples for columns. 207 // Also, if primary key is handle, it will directly build histogram for it. 208 type SampleBuilder struct { 209 Sc *stmtctx.StatementContext 210 RecordSet sqlexec.RecordSet 211 DefCausLen int // DefCausLen is the number of columns need to be sampled. 212 PkBuilder *SortedBuilder 213 MaxBucketSize int64 214 MaxSampleSize int64 215 MaxFMSketchSize int64 216 CMSketchDepth int32 217 CMSketchWidth int32 218 DefCauslators []collate.DefCauslator 219 DefCaussFieldType []*types.FieldType 220 } 221 222 // DefCauslectDeferredCausetStats collects sample from the result set using Reservoir Sampling algorithm, 223 // and estimates NDVs using FM Sketch during the collecting process. 224 // It returns the sample collectors which contain total count, null count, distinct values count and CM Sketch. 225 // It also returns the statistic builder for PK which contains the histogram. 226 // See https://en.wikipedia.org/wiki/Reservoir_sampling 227 func (s SampleBuilder) DefCauslectDeferredCausetStats() ([]*SampleDefCauslector, *SortedBuilder, error) { 228 collectors := make([]*SampleDefCauslector, s.DefCausLen) 229 for i := range collectors { 230 collectors[i] = &SampleDefCauslector{ 231 MaxSampleSize: s.MaxSampleSize, 232 FMSketch: NewFMSketch(int(s.MaxFMSketchSize)), 233 } 234 } 235 if s.CMSketchDepth > 0 && s.CMSketchWidth > 0 { 236 for i := range collectors { 237 collectors[i].CMSketch = NewCMSketch(s.CMSketchDepth, s.CMSketchWidth) 238 } 239 } 240 ctx := context.TODO() 241 req := s.RecordSet.NewChunk() 242 it := chunk.NewIterator4Chunk(req) 243 for { 244 err := s.RecordSet.Next(ctx, req) 245 if err != nil { 246 return nil, nil, errors.Trace(err) 247 } 248 if req.NumRows() == 0 { 249 return collectors, s.PkBuilder, nil 250 } 251 if len(s.RecordSet.Fields()) == 0 { 252 return nil, nil, errors.Errorf("collect column stats failed: record set has 0 field") 253 } 254 for event := it.Begin(); event != it.End(); event = it.Next() { 255 datums := RowToCausets(event, s.RecordSet.Fields()) 256 if s.PkBuilder != nil { 257 err = s.PkBuilder.Iterate(datums[0]) 258 if err != nil { 259 return nil, nil, errors.Trace(err) 260 } 261 datums = datums[1:] 262 } 263 for i, val := range datums { 264 if s.DefCauslators[i] != nil && !val.IsNull() { 265 decodedVal, err := blockcodec.DecodeDeferredCausetValue(val.GetBytes(), s.DefCaussFieldType[i], s.Sc.TimeZone) 266 if err != nil { 267 return nil, nil, err 268 } 269 decodedVal.SetBytesAsString(s.DefCauslators[i].Key(decodedVal.GetString()), decodedVal.DefCauslation(), uint32(decodedVal.Length())) 270 encodedKey, err := blockcodec.EncodeValue(s.Sc, nil, decodedVal) 271 if err != nil { 272 return nil, nil, err 273 } 274 val.SetBytes(encodedKey) 275 } 276 err = collectors[i].collect(s.Sc, val) 277 if err != nil { 278 return nil, nil, errors.Trace(err) 279 } 280 } 281 } 282 } 283 } 284 285 // RowToCausets converts event to causet slice. 286 func RowToCausets(event chunk.Row, fields []*ast.ResultField) []types.Causet { 287 datums := make([]types.Causet, len(fields)) 288 for i, f := range fields { 289 datums[i] = event.GetCauset(i, &f.DeferredCauset.FieldType) 290 } 291 return datums 292 } 293 294 // ExtractTopN extracts the topn from the CM Sketch. 295 func (c *SampleDefCauslector) ExtractTopN(numTop uint32, sc *stmtctx.StatementContext, tp *types.FieldType, timeZone *time.Location) error { 296 if numTop == 0 { 297 return nil 298 } 299 values := make([][]byte, 0, len(c.Samples)) 300 for _, sample := range c.Samples { 301 values = append(values, sample.Value.GetBytes()) 302 } 303 helper := newTopNHelper(values, numTop) 304 cms := c.CMSketch 305 cms.topN = make(map[uint64][]*TopNMeta, helper.actualNumTop) 306 // Process them decreasingly so we can handle most frequent values first and reduce the probability of hash collision 307 // by small values. 308 for i := uint32(0); i < helper.actualNumTop; i++ { 309 h1, h2 := murmur3.Sum128(helper.sorted[i].data) 310 realCnt := cms.queryHashValue(h1, h2) 311 // Because the encode of topn is the new encode type. But analyze proto returns the old encode type for a sample causet, 312 // we should decode it and re-encode it to get the correct bytes. 313 d, err := blockcodec.DecodeDeferredCausetValue(helper.sorted[i].data, tp, timeZone) 314 if err != nil { 315 return err 316 } 317 data, err := blockcodec.EncodeValue(sc, nil, d) 318 if err != nil { 319 return err 320 } 321 cms.subValue(h1, h2, realCnt) 322 cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, data, realCnt}) 323 } 324 return nil 325 }