github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/benchmarks/gen/rolling_value_hasher.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2016 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package gen
    23  
    24  import "github.com/kch42/buzhash"
    25  
    26  const (
    27  	chunkPattern = uint32(1<<12 - 1) // Avg Chunk Size of 4k
    28  
    29  	// The window size to use for computing the rolling hash. This is way more than necessary assuming random data (two bytes would be sufficient with a target chunk size of 4k). The benefit of a larger window is it allows for better distribution on input with lower entropy. At a target chunk size of 4k, any given byte changing has roughly a 1.5% chance of affecting an existing boundary, which seems like an acceptable trade-off.
    30  	chunkWindow = uint32(64)
    31  )
    32  
    33  type rollingValueHasher struct {
    34  	bz *buzhash.BuzHash
    35  }
    36  
    37  func newRollingValueHasher() *rollingValueHasher {
    38  	return &rollingValueHasher{buzhash.NewBuzHash(chunkWindow)}
    39  }
    40  
    41  func (rv *rollingValueHasher) HashByte(b byte) bool {
    42  	rv.bz.HashByte(b)
    43  	return rv.bz.Sum32()&chunkPattern == chunkPattern
    44  }