github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/statistics/sample.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statistics
    15  
    16  import (
    17  	"context"
    18  	"sort"
    19  	"time"
    20  
    21  	"github.com/twmb/murmur3"
    22  	"github.com/whtcorpsinc/BerolinaSQL/allegrosql"
    23  	"github.com/whtcorpsinc/BerolinaSQL/ast"
    24  	"github.com/whtcorpsinc/BerolinaSQL/terror"
    25  	"github.com/whtcorpsinc/errors"
    26  	"github.com/whtcorpsinc/fidelpb/go-fidelpb"
    27  	"github.com/whtcorpsinc/milevadb/blockcodec"
    28  	"github.com/whtcorpsinc/milevadb/ekv"
    29  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    30  	"github.com/whtcorpsinc/milevadb/soliton/collate"
    31  	"github.com/whtcorpsinc/milevadb/soliton/fastrand"
    32  	"github.com/whtcorpsinc/milevadb/soliton/sqlexec"
    33  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    34  	"github.com/whtcorpsinc/milevadb/types"
    35  )
    36  
    37  // SampleItem is an item of sampled column value.
    38  type SampleItem struct {
    39  	// Value is the sampled column value.
    40  	Value types.Causet
    41  	// Ordinal is original position of this item in SampleDefCauslector before sorting. This
    42  	// is used for computing correlation.
    43  	Ordinal int
    44  	// Handle is the handle of the sample in its key.
    45  	// This property is used to calculate Ordinal in fast analyze.
    46  	Handle ekv.Handle
    47  }
    48  
    49  // CopySampleItems returns a deep copy of SampleItem slice.
    50  func CopySampleItems(items []*SampleItem) []*SampleItem {
    51  	n := make([]*SampleItem, len(items))
    52  	for i, item := range items {
    53  		ni := *item
    54  		n[i] = &ni
    55  	}
    56  	return n
    57  }
    58  
    59  // SortSampleItems shallow copies and sorts a slice of SampleItem.
    60  func SortSampleItems(sc *stmtctx.StatementContext, items []*SampleItem) ([]*SampleItem, error) {
    61  	sortedItems := make([]*SampleItem, len(items))
    62  	copy(sortedItems, items)
    63  	sorter := sampleItemSorter{items: sortedItems, sc: sc}
    64  	sort.Sblock(&sorter)
    65  	return sortedItems, sorter.err
    66  }
    67  
    68  type sampleItemSorter struct {
    69  	items []*SampleItem
    70  	sc    *stmtctx.StatementContext
    71  	err   error
    72  }
    73  
    74  func (s *sampleItemSorter) Len() int {
    75  	return len(s.items)
    76  }
    77  
    78  func (s *sampleItemSorter) Less(i, j int) bool {
    79  	var cmp int
    80  	cmp, s.err = s.items[i].Value.CompareCauset(s.sc, &s.items[j].Value)
    81  	if s.err != nil {
    82  		return true
    83  	}
    84  	return cmp < 0
    85  }
    86  
    87  func (s *sampleItemSorter) Swap(i, j int) {
    88  	s.items[i], s.items[j] = s.items[j], s.items[i]
    89  }
    90  
    91  // SampleDefCauslector will collect Samples and calculate the count and ndv of an attribute.
    92  type SampleDefCauslector struct {
    93  	Samples       []*SampleItem
    94  	seenValues    int64 // seenValues is the current seen values.
    95  	IsMerger      bool
    96  	NullCount     int64
    97  	Count         int64 // Count is the number of non-null rows.
    98  	MaxSampleSize int64
    99  	FMSketch      *FMSketch
   100  	CMSketch      *CMSketch
   101  	TotalSize     int64 // TotalSize is the total size of column.
   102  }
   103  
   104  // MergeSampleDefCauslector merges two sample collectors.
   105  func (c *SampleDefCauslector) MergeSampleDefCauslector(sc *stmtctx.StatementContext, rc *SampleDefCauslector) {
   106  	c.NullCount += rc.NullCount
   107  	c.Count += rc.Count
   108  	c.TotalSize += rc.TotalSize
   109  	c.FMSketch.mergeFMSketch(rc.FMSketch)
   110  	if rc.CMSketch != nil {
   111  		err := c.CMSketch.MergeCMSketch(rc.CMSketch, 0)
   112  		terror.Log(errors.Trace(err))
   113  	}
   114  	for _, item := range rc.Samples {
   115  		err := c.collect(sc, item.Value)
   116  		terror.Log(errors.Trace(err))
   117  	}
   118  }
   119  
   120  // SampleDefCauslectorToProto converts SampleDefCauslector to its protobuf representation.
   121  func SampleDefCauslectorToProto(c *SampleDefCauslector) *fidelpb.SampleDefCauslector {
   122  	collector := &fidelpb.SampleDefCauslector{
   123  		NullCount: c.NullCount,
   124  		Count:     c.Count,
   125  		FmSketch:  FMSketchToProto(c.FMSketch),
   126  		TotalSize: &c.TotalSize,
   127  	}
   128  	if c.CMSketch != nil {
   129  		collector.CmSketch = CMSketchToProto(c.CMSketch)
   130  	}
   131  	for _, item := range c.Samples {
   132  		collector.Samples = append(collector.Samples, item.Value.GetBytes())
   133  	}
   134  	return collector
   135  }
   136  
   137  const maxSampleValueLength = allegrosql.MaxFieldVarCharLength / 2
   138  
   139  // SampleDefCauslectorFromProto converts SampleDefCauslector from its protobuf representation.
   140  func SampleDefCauslectorFromProto(collector *fidelpb.SampleDefCauslector) *SampleDefCauslector {
   141  	s := &SampleDefCauslector{
   142  		NullCount: collector.NullCount,
   143  		Count:     collector.Count,
   144  		FMSketch:  FMSketchFromProto(collector.FmSketch),
   145  	}
   146  	if collector.TotalSize != nil {
   147  		s.TotalSize = *collector.TotalSize
   148  	}
   149  	s.CMSketch = CMSketchFromProto(collector.CmSketch)
   150  	for _, val := range collector.Samples {
   151  		// When causetstore the histogram bucket boundaries to ekv, we need to limit the length of the value.
   152  		if len(val) <= maxSampleValueLength {
   153  			item := &SampleItem{Value: types.NewBytesCauset(val)}
   154  			s.Samples = append(s.Samples, item)
   155  		}
   156  	}
   157  	return s
   158  }
   159  
   160  func (c *SampleDefCauslector) collect(sc *stmtctx.StatementContext, d types.Causet) error {
   161  	if !c.IsMerger {
   162  		if d.IsNull() {
   163  			c.NullCount++
   164  			return nil
   165  		}
   166  		c.Count++
   167  		if err := c.FMSketch.InsertValue(sc, d); err != nil {
   168  			return errors.Trace(err)
   169  		}
   170  		if c.CMSketch != nil {
   171  			c.CMSketch.InsertBytes(d.GetBytes())
   172  		}
   173  		// Minus one is to remove the flag byte.
   174  		c.TotalSize += int64(len(d.GetBytes()) - 1)
   175  	}
   176  	c.seenValues++
   177  	// The following code use types.CloneCauset(d) because d may have a deep reference
   178  	// to the underlying slice, GC can't free them which lead to memory leak eventually.
   179  	// TODO: Refactor the proto to avoid copying here.
   180  	if len(c.Samples) < int(c.MaxSampleSize) {
   181  		newItem := &SampleItem{}
   182  		d.Copy(&newItem.Value)
   183  		c.Samples = append(c.Samples, newItem)
   184  	} else {
   185  		shouldAdd := int64(fastrand.Uint64N(uint64(c.seenValues))) < c.MaxSampleSize
   186  		if shouldAdd {
   187  			idx := int(fastrand.Uint32N(uint32(c.MaxSampleSize)))
   188  			newItem := &SampleItem{}
   189  			d.Copy(&newItem.Value)
   190  			// To keep the order of the elements, we use delete and append, not direct rememristed.
   191  			c.Samples = append(c.Samples[:idx], c.Samples[idx+1:]...)
   192  			c.Samples = append(c.Samples, newItem)
   193  		}
   194  	}
   195  	return nil
   196  }
   197  
   198  // CalcTotalSize is to calculate total size based on samples.
   199  func (c *SampleDefCauslector) CalcTotalSize() {
   200  	c.TotalSize = 0
   201  	for _, item := range c.Samples {
   202  		c.TotalSize += int64(len(item.Value.GetBytes()))
   203  	}
   204  }
   205  
   206  // SampleBuilder is used to build samples for columns.
   207  // Also, if primary key is handle, it will directly build histogram for it.
   208  type SampleBuilder struct {
   209  	Sc                *stmtctx.StatementContext
   210  	RecordSet         sqlexec.RecordSet
   211  	DefCausLen        int // DefCausLen is the number of columns need to be sampled.
   212  	PkBuilder         *SortedBuilder
   213  	MaxBucketSize     int64
   214  	MaxSampleSize     int64
   215  	MaxFMSketchSize   int64
   216  	CMSketchDepth     int32
   217  	CMSketchWidth     int32
   218  	DefCauslators     []collate.DefCauslator
   219  	DefCaussFieldType []*types.FieldType
   220  }
   221  
   222  // DefCauslectDeferredCausetStats collects sample from the result set using Reservoir Sampling algorithm,
   223  // and estimates NDVs using FM Sketch during the collecting process.
   224  // It returns the sample collectors which contain total count, null count, distinct values count and CM Sketch.
   225  // It also returns the statistic builder for PK which contains the histogram.
   226  // See https://en.wikipedia.org/wiki/Reservoir_sampling
   227  func (s SampleBuilder) DefCauslectDeferredCausetStats() ([]*SampleDefCauslector, *SortedBuilder, error) {
   228  	collectors := make([]*SampleDefCauslector, s.DefCausLen)
   229  	for i := range collectors {
   230  		collectors[i] = &SampleDefCauslector{
   231  			MaxSampleSize: s.MaxSampleSize,
   232  			FMSketch:      NewFMSketch(int(s.MaxFMSketchSize)),
   233  		}
   234  	}
   235  	if s.CMSketchDepth > 0 && s.CMSketchWidth > 0 {
   236  		for i := range collectors {
   237  			collectors[i].CMSketch = NewCMSketch(s.CMSketchDepth, s.CMSketchWidth)
   238  		}
   239  	}
   240  	ctx := context.TODO()
   241  	req := s.RecordSet.NewChunk()
   242  	it := chunk.NewIterator4Chunk(req)
   243  	for {
   244  		err := s.RecordSet.Next(ctx, req)
   245  		if err != nil {
   246  			return nil, nil, errors.Trace(err)
   247  		}
   248  		if req.NumRows() == 0 {
   249  			return collectors, s.PkBuilder, nil
   250  		}
   251  		if len(s.RecordSet.Fields()) == 0 {
   252  			return nil, nil, errors.Errorf("collect column stats failed: record set has 0 field")
   253  		}
   254  		for event := it.Begin(); event != it.End(); event = it.Next() {
   255  			datums := RowToCausets(event, s.RecordSet.Fields())
   256  			if s.PkBuilder != nil {
   257  				err = s.PkBuilder.Iterate(datums[0])
   258  				if err != nil {
   259  					return nil, nil, errors.Trace(err)
   260  				}
   261  				datums = datums[1:]
   262  			}
   263  			for i, val := range datums {
   264  				if s.DefCauslators[i] != nil && !val.IsNull() {
   265  					decodedVal, err := blockcodec.DecodeDeferredCausetValue(val.GetBytes(), s.DefCaussFieldType[i], s.Sc.TimeZone)
   266  					if err != nil {
   267  						return nil, nil, err
   268  					}
   269  					decodedVal.SetBytesAsString(s.DefCauslators[i].Key(decodedVal.GetString()), decodedVal.DefCauslation(), uint32(decodedVal.Length()))
   270  					encodedKey, err := blockcodec.EncodeValue(s.Sc, nil, decodedVal)
   271  					if err != nil {
   272  						return nil, nil, err
   273  					}
   274  					val.SetBytes(encodedKey)
   275  				}
   276  				err = collectors[i].collect(s.Sc, val)
   277  				if err != nil {
   278  					return nil, nil, errors.Trace(err)
   279  				}
   280  			}
   281  		}
   282  	}
   283  }
   284  
   285  // RowToCausets converts event to causet slice.
   286  func RowToCausets(event chunk.Row, fields []*ast.ResultField) []types.Causet {
   287  	datums := make([]types.Causet, len(fields))
   288  	for i, f := range fields {
   289  		datums[i] = event.GetCauset(i, &f.DeferredCauset.FieldType)
   290  	}
   291  	return datums
   292  }
   293  
   294  // ExtractTopN extracts the topn from the CM Sketch.
   295  func (c *SampleDefCauslector) ExtractTopN(numTop uint32, sc *stmtctx.StatementContext, tp *types.FieldType, timeZone *time.Location) error {
   296  	if numTop == 0 {
   297  		return nil
   298  	}
   299  	values := make([][]byte, 0, len(c.Samples))
   300  	for _, sample := range c.Samples {
   301  		values = append(values, sample.Value.GetBytes())
   302  	}
   303  	helper := newTopNHelper(values, numTop)
   304  	cms := c.CMSketch
   305  	cms.topN = make(map[uint64][]*TopNMeta, helper.actualNumTop)
   306  	// Process them decreasingly so we can handle most frequent values first and reduce the probability of hash collision
   307  	// by small values.
   308  	for i := uint32(0); i < helper.actualNumTop; i++ {
   309  		h1, h2 := murmur3.Sum128(helper.sorted[i].data)
   310  		realCnt := cms.queryHashValue(h1, h2)
   311  		// Because the encode of topn is the new encode type. But analyze proto returns the old encode type for a sample causet,
   312  		// we should decode it and re-encode it to get the correct bytes.
   313  		d, err := blockcodec.DecodeDeferredCausetValue(helper.sorted[i].data, tp, timeZone)
   314  		if err != nil {
   315  			return err
   316  		}
   317  		data, err := blockcodec.EncodeValue(sc, nil, d)
   318  		if err != nil {
   319  			return err
   320  		}
   321  		cms.subValue(h1, h2, realCnt)
   322  		cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, data, realCnt})
   323  	}
   324  	return nil
   325  }