github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/hash/hash.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2016 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  // Package hash implements the hash function used throughout Noms.
    23  //
    24  // Noms serialization from version 4-onward uses the first 20 bytes of sha-512 for hashes.
    25  //
    26  // sha-512 was chosen because:
    27  //
    28  // - sha-1 is no longer recommended.
    29  // - sha-3 is brand new, not a lot of platform support.
    30  // - blake is not commonly used, not a lot of platform support.
    31  // - within sha-2, sha-512 is faster than sha-256 on 64 bit.
    32  //
    33  // Our specific truncation scheme (first 20 bytes) was chosen because:
    34  //
    35  // - The "standard" truncation schemes are not widely supported. For example, at time of writing, there is no fast native implementation of sha512/256 on Node.
    36  // - The smallest standard truncation of sha512 is 28 bytes, but we don't need this many. And because we are a database, the size of the hashes matters. Bigger hashes mean less data in each chunk, which means less tree fan-out, which means slower iteration and searching. 20 bytes is a good balance between collision resistance and wide trees.
    37  // - 20 bytes leads to a nice round number of base32 digits: 32.
    38  //
    39  // The textual serialization of hashes uses big-endian base32 with the alphabet {0-9,a-v}. This scheme was chosen because:
    40  //
    41  // - It's easy to convert to and from base32 without bignum arithmetic.
    42  // - No special chars: you can double-click to select in GUIs.
    43  // - Sorted hashes will be sorted textually, making it easy to scan for humans.
    44  //
    45  // In Noms, the hash function is a component of the serialization version, which is constant over the entire lifetime of a single database. So clients do not need to worry about encountering multiple hash functions in the same database.
    46  package hash
    47  
    48  import (
    49  	"bytes"
    50  	"crypto/sha512"
    51  	"encoding/binary"
    52  	"fmt"
    53  	"regexp"
    54  	"strconv"
    55  	"strings"
    56  
    57  	"github.com/dolthub/dolt/go/store/d"
    58  )
    59  
    60  const (
    61  	// ByteLen is the number of bytes used to represent the Hash.
    62  	ByteLen = 20
    63  
    64  	// PrefixLen is the number of bytes used to represent the Prefix of the Hash.
    65  	PrefixLen = 8 // uint64
    66  
    67  	// SuffixLen is the number of bytes which come after the Prefix.
    68  	SuffixLen = ByteLen - PrefixLen
    69  
    70  	// StringLen is the number of characters need to represent the Hash using Base32.
    71  	StringLen = 32 // 20 * 8 / log2(32)
    72  )
    73  
    74  var (
    75  	pattern   = regexp.MustCompile("^([0-9a-v]{" + strconv.Itoa(StringLen) + "})$")
    76  	emptyHash = Hash{}
    77  )
    78  
// Hash is used to represent the hash of a Noms Value. It is a fixed-size
// array of ByteLen bytes, so two Hashes can be compared with == and a Hash
// can be used directly as a map key.
type Hash [ByteLen]byte
    81  
    82  // IsEmpty determines if this Hash is equal to the empty hash (all zeroes).
    83  func (h Hash) IsEmpty() bool {
    84  	return h == emptyHash
    85  }
    86  
    87  // String returns a string representation of the hash using Base32 encoding.
    88  func (h Hash) String() string {
    89  	return encode(h[:])
    90  }
    91  
    92  // Of computes a new Hash from data.
    93  func Of(data []byte) Hash {
    94  	r := sha512.Sum512(data)
    95  	h := Hash{}
    96  	copy(h[:], r[:ByteLen])
    97  	return h
    98  }
    99  
   100  // New creates a new Hash backed by data, ensuring that data is an acceptable length.
   101  func New(data []byte) Hash {
   102  	d.PanicIfFalse(len(data) == ByteLen)
   103  	h := Hash{}
   104  	copy(h[:], data)
   105  	return h
   106  }
   107  
   108  // MaybeParse parses a string representing a hash as a Base32 encoded byte array.
   109  // If the string is not well formed then this returns (emptyHash, false).
   110  func MaybeParse(s string) (Hash, bool) {
   111  	match := pattern.FindStringSubmatch(s)
   112  	if match == nil {
   113  		return emptyHash, false
   114  	}
   115  	return New(decode(s)), true
   116  }
   117  
   118  // IsValid returns true if the provided string is a valid base32 encoded hash and false if it is not.
   119  func IsValid(s string) bool {
   120  	return pattern.MatchString(s)
   121  }
   122  
   123  // Parse parses a string representing a hash as a Base32 encoded byte array.
   124  // If the string is not well formed then this panics.
   125  func Parse(s string) Hash {
   126  	r, ok := MaybeParse(s)
   127  	if !ok {
   128  		d.PanicIfError(fmt.Errorf("cound not parse Hash: %s", s))
   129  	}
   130  	return r
   131  }
   132  
   133  // Prefix returns the first 8 bytes of the hash as a unit64. Used for chunk indexing
   134  func (h Hash) Prefix() uint64 {
   135  	return binary.BigEndian.Uint64(h[:PrefixLen])
   136  }
   137  
   138  // Suffix returns the last 12 bytes of the hash. Used for chunk indexing
   139  func (h Hash) Suffix() []byte {
   140  	return h[PrefixLen:]
   141  }
   142  
   143  // Less compares two hashes returning whether this Hash is less than other.
   144  func (h Hash) Less(other Hash) bool {
   145  	return h.Compare(other) < 0
   146  }
   147  
   148  // Compare compares two hashes returning a negative value if h < other, 0 if h == other, and 1 if h > other
   149  func (h Hash) Compare(other Hash) int {
   150  	return bytes.Compare(h[:], other[:])
   151  }
   152  
   153  // Equal compares two hashes returning whether this Hash is equal to other.
   154  func (h Hash) Equal(other Hash) bool {
   155  	return h.Compare(other) == 0
   156  }
   157  
// HashSet is a set of Hashes. It is a map whose keys are the members of the
// set and whose values are empty structs.
type HashSet map[Hash]struct{}
   160  
   161  func NewHashSet(hashes ...Hash) HashSet {
   162  	out := make(HashSet, len(hashes))
   163  	for _, h := range hashes {
   164  		out.Insert(h)
   165  	}
   166  	return out
   167  }
   168  
   169  func (hs HashSet) Size() int {
   170  	return len(hs)
   171  }
   172  
   173  // Insert adds a Hash to the set.
   174  func (hs HashSet) Insert(hash Hash) {
   175  	hs[hash] = struct{}{}
   176  }
   177  
   178  // Has returns true if the HashSet contains hash.
   179  func (hs HashSet) Has(hash Hash) bool {
   180  	_, has := hs[hash]
   181  	return has
   182  }
   183  
// Remove removes hash from the HashSet. Removing a hash that is not a member
// is a no-op, as is calling Remove on a nil HashSet, because the built-in
// delete does nothing in both cases.
func (hs HashSet) Remove(hash Hash) {
	delete(hs, hash)
}
   188  
   189  // Copy returns a copy of the hashset
   190  func (hs HashSet) Copy() HashSet {
   191  	copyOf := make(HashSet, len(hs))
   192  
   193  	for k := range hs {
   194  		copyOf[k] = struct{}{}
   195  	}
   196  
   197  	return copyOf
   198  }
   199  
   200  // InsertAll inserts all elements of a HashSet into this HashSet
   201  func (hs HashSet) InsertAll(other HashSet) {
   202  	for h, _ := range other {
   203  		hs[h] = struct{}{}
   204  	}
   205  }
   206  
   207  func (hs HashSet) Equals(other HashSet) bool {
   208  	if hs.Size() != other.Size() {
   209  		return false
   210  	}
   211  	for h := range hs {
   212  		if !other.Has(h) {
   213  			return false
   214  		}
   215  	}
   216  	return true
   217  }
   218  
   219  func (hs HashSet) Empty() {
   220  	for h := range hs {
   221  		delete(hs, h)
   222  	}
   223  }
   224  
   225  func (hs HashSet) String() string {
   226  	var sb strings.Builder
   227  	sb.Grow(len(hs)*34 + 100)
   228  
   229  	sb.WriteString("HashSet {\n")
   230  	for h := range hs {
   231  		sb.WriteString("\t")
   232  		sb.WriteString(h.String())
   233  		sb.WriteString("\n")
   234  	}
   235  	sb.WriteString("}\n")
   236  	return sb.String()
   237  }