github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/node_splitter.go

// Copyright 2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package tree

import (
	"crypto/sha512"
	"encoding/binary"
	"math"

	"github.com/kch42/buzhash"
	"github.com/zeebo/xxh3"
)

const (
	minChunkSize = 1 << 9
	maxChunkSize = 1 << 14
)

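// levelSalt gives each tree level its own hash salt, derived
// deterministically by saltFromLevel so that the same data always chunks
// the same way across processes. Distinct per-level salts keep boundary
// decisions at one level independent of those at another.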
var levelSalt = [...]uint64{
	saltFromLevel(1),
	saltFromLevel(2),
	saltFromLevel(3),
	saltFromLevel(4),
	saltFromLevel(5),
	saltFromLevel(6),
	saltFromLevel(7),
	saltFromLevel(8),
	saltFromLevel(9),
	saltFromLevel(10),
	saltFromLevel(11),
	saltFromLevel(12),
	saltFromLevel(13),
	saltFromLevel(14),
	saltFromLevel(15),
}

// splitterFactory makes a nodeSplitter.
type splitterFactory func(level uint8) nodeSplitter

var defaultSplitterFactory splitterFactory = newKeySplitter

// nodeSplitter decides where Item streams should be split into chunks.
type nodeSplitter interface {
	// Append provides more Items to the splitter. Splitters make chunk
	// boundary decisions based on the Item contents. Upon return, callers
	// can use CrossedBoundary() to see if a chunk boundary has been crossed.
	Append(key, value Item) error

	// CrossedBoundary returns true if the provided Items have caused a chunk
	// boundary to be crossed.
	CrossedBoundary() bool

	// Reset resets the state of the splitter.
	Reset()
}
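
// A minimal usage sketch (illustrative; the pairs variable and the chunk
// handling are hypothetical, not part of this file): callers append pairs
// until the splitter reports a boundary, then cut a chunk and reset.
//
//	sp := defaultSplitterFactory(0)
//	for _, p := range pairs {
//		if err := sp.Append(p.key, p.value); err != nil {
//			return err
//		}
//		if sp.CrossedBoundary() {
//			// ...finalize the current chunk here...
//			sp.Reset()
//		}
//	}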

// rollingHashSplitter is a nodeSplitter that makes chunk boundary decisions using
// a rolling value hasher that processes Item pairs in a byte-wise fashion.
//
// rollingHashSplitter uses a dynamic hash pattern designed to constrain the chunk
// size distribution by reducing the likelihood of forming very large or very small
// chunks. As the size of the current chunk grows, rollingHashSplitter changes the
// target pattern to make it easier to match. The result is a chunk size distribution
// that is closer to a binomial distribution than a geometric one.
type rollingHashSplitter struct {
	bz     *buzhash.BuzHash
	offset uint32
	window uint32
	salt   byte

	crossedBoundary bool
}

const (
	// The window size to use for computing the rolling hash. This is way more than necessary assuming random data
	// (two bytes would be sufficient with a target chunk size of 4k). The benefit of a larger window is that it allows
	// for better distribution on input with lower entropy. At a target chunk size of 4k, any given byte changing
	// has roughly a 1.5% chance of affecting an existing boundary, which seems like an acceptable trade-off. The
	// choice of a prime number provides better distribution for repeating input.
	rollingHashWindow = uint32(67)
)

var _ nodeSplitter = &rollingHashSplitter{}

func newRollingHashSplitter(salt uint8) nodeSplitter {
	return &rollingHashSplitter{
		bz:     buzhash.NewBuzHash(rollingHashWindow),
		window: rollingHashWindow,
		salt:   byte(salt),
	}
}

var _ splitterFactory = newRollingHashSplitter

// Append implements nodeSplitter.
func (sns *rollingHashSplitter) Append(key, value Item) (err error) {
	for _, byt := range key {
		_ = sns.hashByte(byt)
	}
	for _, byt := range value {
		_ = sns.hashByte(byt)
	}
	return nil
}

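// hashByte folds |b| into the rolling hash and updates crossedBoundary.
// Chunks below minChunkSize never split, chunks above maxChunkSize always
// split, and in between the rolling hash is tested against a
// size-dependent pattern (see rollingHashPattern).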
func (sns *rollingHashSplitter) hashByte(b byte) bool {
	sns.offset++

	if sns.crossedBoundary {
		return true
	}

	sns.bz.HashByte(b ^ sns.salt)

	if sns.offset < minChunkSize {
		return true
	}
	if sns.offset > maxChunkSize {
		sns.crossedBoundary = true
		return true
	}

	hash := sns.bz.Sum32()
	patt := rollingHashPattern(sns.offset)
	sns.crossedBoundary = hash&patt == patt

	return sns.crossedBoundary
}

// CrossedBoundary implements nodeSplitter.
func (sns *rollingHashSplitter) CrossedBoundary() bool {
	return sns.crossedBoundary
}

// Reset implements nodeSplitter.
func (sns *rollingHashSplitter) Reset() {
	sns.crossedBoundary = false
	sns.offset = 0
	sns.bz = buzhash.NewBuzHash(sns.window)
}

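// rollingHashPattern returns the bit pattern the rolling hash must match
// at the current offset. For example (illustrative): for offsets in
// [512, 1024), offset>>10 is 0, so the pattern has 15 low bits set (0x7fff)
// and a uniformly distributed hash matches with probability 2^-15 per byte;
// each further KiB of chunk growth drops one bit from the pattern, doubling
// the per-byte odds of a boundary.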
func rollingHashPattern(offset uint32) uint32 {
	shift := 15 - (offset >> 10)
	return 1<<shift - 1
}

// keySplitter is a nodeSplitter that makes chunk boundary decisions on the hash
// of the key of an Item pair. In contrast to the rollingHashSplitter, keySplitter
// tries to create chunks that have an average number of Item pairs, rather than
// an average number of bytes. However, because the target number of Item pairs
// is computed directly from the chunk size and count, the practical difference in
// the distribution of chunk sizes is minimal.
//
// keySplitter uses a dynamic threshold modeled on a Weibull distribution
// (https://en.wikipedia.org/wiki/Weibull_distribution). As the size of the current
// chunk increases, it becomes easier to pass the threshold, reducing the likelihood
// of forming very large or very small chunks.
type keySplitter struct {
	count, size     uint32
	crossedBoundary bool

	salt uint64
}

func newKeySplitter(level uint8) nodeSplitter {
	return &keySplitter{
		salt: levelSalt[level],
	}
}

var _ splitterFactory = newKeySplitter

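// Append implements nodeSplitter. As with the rolling hash, chunks below
// minChunkSize never split and chunks above maxChunkSize always split;
// in between, the key's hash is tested against a Weibull-derived threshold.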
func (ks *keySplitter) Append(key, value Item) error {
	thisSize := uint32(len(key) + len(value))
	ks.size += thisSize

	if ks.size < minChunkSize {
		return nil
	}
	if ks.size > maxChunkSize {
		ks.crossedBoundary = true
		return nil
	}

	h := xxHash32(key, ks.salt)
	ks.crossedBoundary = weibullCheck(ks.size, thisSize, h)
	return nil
}

func (ks *keySplitter) CrossedBoundary() bool {
	return ks.crossedBoundary
}

func (ks *keySplitter) Reset() {
	ks.size = 0
	ks.crossedBoundary = false
}

const (
	targetSize float64 = 4096
	maxUint32  float64 = math.MaxUint32

	// weibull params
	K = 4.

	// TODO: seems like this should be targetSize / math.Gamma(1 + 1/K).
	L = targetSize
)

// weibullCheck returns true if we should split
// at |hash| for a given record inserted into a
// chunk of size |size|, where the record's size
// is |thisSize|. |size| is the size of the chunk
// after the record is inserted, so it includes
// |thisSize|.
//
// weibullCheck attempts to form chunks whose
// sizes match the Weibull distribution.
//
// The logic is as follows: given that we haven't
// split on any of the records up to |size - thisSize|,
// the probability that we should split on this record
// is (CDF(end) - CDF(start)) / (1 - CDF(start)), i.e.
// the percentage of the remaining portion of the CDF
// that this record actually covers. We split if |hash|,
// treated as a uniform random number in [0,1),
// is less than this percentage.
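//
// Concretely (illustrative numbers): CDF(x) = 1 - exp(-(x/L)^K), computed
// below as -math.Expm1(-math.Pow(x/L, K)). With L = 4096 and K = 4, a
// 512-byte record growing a chunk from 1536 to 2048 bytes gives
// start ≈ 0.0196 and end ≈ 0.0606, so the split probability is
// (end - start) / (1 - start) ≈ 4.2%.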
func weibullCheck(size, thisSize, hash uint32) bool {
	startx := float64(size - thisSize)
	start := -math.Expm1(-math.Pow(startx/L, K))

	endx := float64(size)
	end := -math.Expm1(-math.Pow(endx/L, K))

	p := float64(hash) / maxUint32
	d := 1 - start
	if d <= 0 {
		return true
	}
	target := (end - start) / d
	return p < target
}
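
// Note: when a chunk grows well past L, the computed CDF saturates to
// exactly 1.0 in float64, so the d <= 0 guard above forces a split rather
// than dividing by zero.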

func xxHash32(b []byte, salt uint64) uint32 {
	return uint32(xxh3.HashSeed(b, salt))
}

func saltFromLevel(level uint8) (salt uint64) {
	full := sha512.Sum512([]byte{level})
	return binary.LittleEndian.Uint64(full[:8])
}