github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/node_splitter_test.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tree
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math"
    21  	"math/bits"
    22  	"math/rand"
    23  	"testing"
    24  
    25  	"github.com/stretchr/testify/assert"
    26  	"github.com/stretchr/testify/require"
    27  
    28  	"github.com/dolthub/dolt/go/store/prolly/message"
    29  	"github.com/dolthub/dolt/go/store/val"
    30  )
    31  
    32  func init() {
    33  	benchData = make([][24]byte, 100_000)
    34  	buf := make([]byte, 24*100_000)
    35  	rand.Read(buf)
    36  	for i := range benchData {
    37  		start, stop := i*24, (i+1)*24
    38  		copy(benchData[i][:], buf[start:stop])
    39  	}
    40  }
    41  
    42  var benchData [][24]byte
    43  
    44  func BenchmarkRollingHashSplitter(b *testing.B) {
    45  	benchmarkNodeSplitter(b, newRollingHashSplitter(0))
    46  }
    47  
    48  func BenchmarkKeySplitter(b *testing.B) {
    49  	benchmarkNodeSplitter(b, newKeySplitter(0))
    50  }
    51  
    52  func benchmarkNodeSplitter(b *testing.B, split nodeSplitter) {
    53  	for i := 0; i < b.N; i++ {
    54  		j := i % len(benchData)
    55  		err := split.Append(benchData[j][:8], benchData[j][8:])
    56  		assert.NoError(b, err)
    57  		if split.CrossedBoundary() {
    58  			split.Reset()
    59  		}
    60  	}
    61  }
    62  
    63  func TestKeySplitterDistribution(t *testing.T) {
    64  	t.Skip("unskip for metrics")
    65  
    66  	factory := newKeySplitter
    67  	t.Run("plot node distribution for item Size 24", func(t *testing.T) {
    68  		scale := 1_000_000
    69  		nd, ns := makeProllyTreeWithSizes(t, factory, scale, 8, 16)
    70  		PrintTreeSummaryByLevel(t, nd, ns)
    71  		plotNodeSizeDistribution(t, "prolly_8_16.png", nd, ns)
    72  	})
    73  	t.Run("summarize node distribution for item sizes (8,54)", func(t *testing.T) {
    74  		for sz := 8; sz <= 54; sz++ {
    75  			fmt.Printf("Summary for map Size %d\n", sz)
    76  			nd, ns := makeProllyTreeWithSizes(t, factory, 100_000, sz, sz)
    77  			PrintTreeSummaryByLevel(t, nd, ns)
    78  			fmt.Println()
    79  		}
    80  	})
    81  	t.Run("plot node distribution for item sizes (8,54)", func(t *testing.T) {
    82  		var cumulative Samples
    83  		for sz := 8; sz <= 54; sz++ {
    84  			nd, ns := makeProllyTreeWithSizes(t, factory, 100_000, sz, sz)
    85  			data, err := measureTreeNodes(nd, ns)
    86  			require.NoError(t, err)
    87  			cumulative = append(cumulative, data...)
    88  		}
    89  		fmt.Println(cumulative.Summary())
    90  		plotIntHistogram("cumulative_node_sizes_8-54.png", cumulative)
    91  	})
    92  }
    93  
    94  func makeProllyTreeWithSizes(t *testing.T, fact splitterFactory, scale, keySz, valSz int) (nd Node, ns NodeStore) {
    95  	pro := gaussianItems{
    96  		keyMean: float64(keySz),
    97  		keyStd:  float64(keySz) / 4,
    98  		valMean: float64(valSz),
    99  		valStd:  float64(valSz) / 4,
   100  		r:       testRand,
   101  	}
   102  
   103  	ctx := context.Background()
   104  	ns = NewTestNodeStore()
   105  	serializer := message.NewProllyMapSerializer(val.TupleDesc{}, ns.Pool())
   106  	chunker, err := newEmptyChunker(ctx, ns, serializer)
   107  	require.NoError(t, err)
   108  
   109  	for i := 0; i < scale; i++ {
   110  		k, v := pro.Next()
   111  		_, err = chunker.append(ctx, k, v, 1)
   112  		require.NoError(t, err)
   113  	}
   114  
   115  	nd, err = chunker.Done(ctx)
   116  	require.NoError(t, err)
   117  	return
   118  }
   119  
   120  type itemProvider interface {
   121  	Next() (key, value Item)
   122  }
   123  
   124  type gaussianItems struct {
   125  	keyMean, keyStd float64
   126  	valMean, valStd float64
   127  	r               *rand.Rand
   128  }
   129  
   130  func (g gaussianItems) Next() (key, value Item) {
   131  	key = make(Item, g.sample(g.keyMean, g.keyStd))
   132  	value = make(Item, g.sample(g.valMean, g.valStd))
   133  	rand.Read(key)
   134  	rand.Read(value)
   135  	return
   136  }
   137  
   138  func (g gaussianItems) sample(mean, std float64) (s int) {
   139  	s = int(math.Round(g.r.NormFloat64()*std + mean))
   140  	if s < 0 {
   141  		s = 0
   142  	}
   143  	return
   144  }
   145  
   146  type staticItems struct {
   147  	key, value int
   148  }
   149  
   150  func (s staticItems) Next() (key, value Item) {
   151  	key = make(Item, s.key)
   152  	value = make(Item, s.value)
   153  	rand.Read(key)
   154  	rand.Read(value)
   155  	return
   156  }
   157  
   158  func TestRoundLog2(t *testing.T) {
   159  	for i := 1; i < 16384; i++ {
   160  		exp := int(math.Round(math.Log2(float64(i))))
   161  		act := int(roundLog2(uint32(i)))
   162  		assert.Equal(t, exp, act)
   163  	}
   164  }
   165  
   166  const (
   167  	// log2MidPoint is 2^31.5
   168  	log2MidPoint = 0b10110101000001001111001100110011
   169  )
   170  
   171  // roundLog2 is an optimized version of
   172  // uint32(math.Round(math.Log2(sz)))
   173  // note: not currently used in any splitter
   174  func roundLog2(sz uint32) (lg uint32) {
   175  	// invariant: |sz| > 1
   176  	lg = uint32(bits.Len32(sz) - 1)
   177  	if sz > (log2MidPoint >> (31 - lg)) {
   178  		lg++
   179  	}
   180  	return
   181  }