github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/node_splitter_test.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tree 16 17 import ( 18 "context" 19 "fmt" 20 "math" 21 "math/bits" 22 "math/rand" 23 "testing" 24 25 "github.com/stretchr/testify/assert" 26 "github.com/stretchr/testify/require" 27 28 "github.com/dolthub/dolt/go/store/prolly/message" 29 "github.com/dolthub/dolt/go/store/val" 30 ) 31 32 func init() { 33 benchData = make([][24]byte, 100_000) 34 buf := make([]byte, 24*100_000) 35 rand.Read(buf) 36 for i := range benchData { 37 start, stop := i*24, (i+1)*24 38 copy(benchData[i][:], buf[start:stop]) 39 } 40 } 41 42 var benchData [][24]byte 43 44 func BenchmarkRollingHashSplitter(b *testing.B) { 45 benchmarkNodeSplitter(b, newRollingHashSplitter(0)) 46 } 47 48 func BenchmarkKeySplitter(b *testing.B) { 49 benchmarkNodeSplitter(b, newKeySplitter(0)) 50 } 51 52 func benchmarkNodeSplitter(b *testing.B, split nodeSplitter) { 53 for i := 0; i < b.N; i++ { 54 j := i % len(benchData) 55 err := split.Append(benchData[j][:8], benchData[j][8:]) 56 assert.NoError(b, err) 57 if split.CrossedBoundary() { 58 split.Reset() 59 } 60 } 61 } 62 63 func TestKeySplitterDistribution(t *testing.T) { 64 t.Skip("unskip for metrics") 65 66 factory := newKeySplitter 67 t.Run("plot node distribution for item Size 24", func(t *testing.T) { 68 scale := 1_000_000 69 nd, ns := makeProllyTreeWithSizes(t, factory, scale, 8, 16) 70 PrintTreeSummaryByLevel(t, nd, ns) 71 plotNodeSizeDistribution(t, "prolly_8_16.png", nd, ns) 72 }) 73 t.Run("summarize node distribution for item sizes (8,54)", func(t *testing.T) { 74 for sz := 8; sz <= 54; sz++ { 75 fmt.Printf("Summary for map Size %d\n", sz) 76 nd, ns := makeProllyTreeWithSizes(t, factory, 100_000, sz, sz) 77 PrintTreeSummaryByLevel(t, nd, ns) 78 fmt.Println() 79 } 80 }) 81 t.Run("plot node distribution for item sizes (8,54)", func(t *testing.T) { 82 var cumulative Samples 83 for sz := 8; sz <= 54; sz++ { 84 nd, ns := makeProllyTreeWithSizes(t, factory, 100_000, sz, sz) 85 data, err := measureTreeNodes(nd, ns) 86 require.NoError(t, err) 87 cumulative = append(cumulative, data...) 88 } 89 fmt.Println(cumulative.Summary()) 90 plotIntHistogram("cumulative_node_sizes_8-54.png", cumulative) 91 }) 92 } 93 94 func makeProllyTreeWithSizes(t *testing.T, fact splitterFactory, scale, keySz, valSz int) (nd Node, ns NodeStore) { 95 pro := gaussianItems{ 96 keyMean: float64(keySz), 97 keyStd: float64(keySz) / 4, 98 valMean: float64(valSz), 99 valStd: float64(valSz) / 4, 100 r: testRand, 101 } 102 103 ctx := context.Background() 104 ns = NewTestNodeStore() 105 serializer := message.NewProllyMapSerializer(val.TupleDesc{}, ns.Pool()) 106 chunker, err := newEmptyChunker(ctx, ns, serializer) 107 require.NoError(t, err) 108 109 for i := 0; i < scale; i++ { 110 k, v := pro.Next() 111 _, err = chunker.append(ctx, k, v, 1) 112 require.NoError(t, err) 113 } 114 115 nd, err = chunker.Done(ctx) 116 require.NoError(t, err) 117 return 118 } 119 120 type itemProvider interface { 121 Next() (key, value Item) 122 } 123 124 type gaussianItems struct { 125 keyMean, keyStd float64 126 valMean, valStd float64 127 r *rand.Rand 128 } 129 130 func (g gaussianItems) Next() (key, value Item) { 131 key = make(Item, g.sample(g.keyMean, g.keyStd)) 132 value = make(Item, g.sample(g.valMean, g.valStd)) 133 rand.Read(key) 134 rand.Read(value) 135 return 136 } 137 138 func (g gaussianItems) sample(mean, std float64) (s int) { 139 s = int(math.Round(g.r.NormFloat64()*std + mean)) 140 if s < 0 { 141 s = 0 142 } 143 return 144 } 145 146 type staticItems struct { 147 key, value int 148 } 149 150 func (s staticItems) Next() (key, value Item) { 151 key = make(Item, s.key) 152 value = make(Item, s.value) 153 rand.Read(key) 154 rand.Read(value) 155 return 156 } 157 158 func TestRoundLog2(t *testing.T) { 159 for i := 1; i < 16384; i++ { 160 exp := int(math.Round(math.Log2(float64(i)))) 161 act := int(roundLog2(uint32(i))) 162 assert.Equal(t, exp, act) 163 } 164 } 165 166 const ( 167 // log2MidPoint is 2^31.5 168 log2MidPoint = 0b10110101000001001111001100110011 169 ) 170 171 // roundLog2 is an optimized version of 172 // uint32(math.Round(math.Log2(sz))) 173 // note: not currently used in any splitter 174 func roundLog2(sz uint32) (lg uint32) { 175 // invariant: |sz| > 1 176 lg = uint32(bits.Len32(sz) - 1) 177 if sz > (log2MidPoint >> (31 - lg)) { 178 lg++ 179 } 180 return 181 }