github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/go/hash/hash.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  // Package hash implements the hash function used throughout Noms.
     6  //
     7  // Noms serialization from version 4-onward uses the first 20 bytes of sha-512 for hashes.
     8  //
     9  // sha-512 was chosen because:
    10  //
    11  // - sha-1 is no longer recommended.
    12  // - sha-3 is brand new, not a lot of platform support.
    13  // - blake is not commonly used, not a lot of platform support.
    14  // - within sha-2, sha-512 is faster than sha-256 on 64 bit.
    15  //
    16  // Our specific truncation scheme (first 20 bytes) was chosen because:
    17  //
    18  // - The "standard" truncation schemes are not widely supported. For example, at time of writing, there is no fast native implementation of sha512/256 on Node.
    19  // - The smallest standard truncation of sha512 is 28 bytes, but we don't need this many. And because we are a database, the size of the hashes matters. Bigger hashes mean less data in each chunk, which means less tree fan-out, which means slower iteration and searching. 20 bytes is a good balance between collision resistance and wide trees.
    20  // - 20 bytes leads to a nice round number of base32 digits: 32.
    21  //
    22  // The textual serialization of hashes uses big-endian base32 with the alphabet {0-9,a-v}. This scheme was chosen because:
    23  //
    24  // - It's easy to convert to and from base32 without bignum arithmetic.
    25  // - No special chars: you can double-click to select in GUIs.
    26  // - Sorted hashes will be sorted textually, making it easy to scan for humans.
    27  //
    28  // In Noms, the hash function is a component of the serialization version, which is constant over the entire lifetime of a single database. So clients do not need to worry about encountering multiple hash functions in the same database.
    29  package hash
    30  
    31  import (
    32  	"bytes"
    33  	"crypto/sha512"
    34  	"fmt"
    35  	"regexp"
    36  	"strconv"
    37  
    38  	"github.com/attic-labs/noms/go/d"
    39  )
    40  
    41  const (
    42  	// ByteLen is the number of bytes used to represent the Hash.
    43  	ByteLen = 20
    44  
    45  	// StringLen is the number of characters need to represent the Hash using Base32.
    46  	StringLen = 32 // 20 * 8 / log2(32)
    47  )
    48  
    49  var (
    50  	pattern   = regexp.MustCompile("^([0-9a-v]{" + strconv.Itoa(StringLen) + "})$")
    51  	emptyHash = Hash{}
    52  )
    53  
    54  // Hash is used to represent the hash of a Noms Value.
    55  type Hash [ByteLen]byte
    56  
    57  // IsEmpty determines if this Hash is equal to the empty hash (all zeroes).
    58  func (h Hash) IsEmpty() bool {
    59  	return h == emptyHash
    60  }
    61  
    62  // String returns a string representation of the hash using Base32 encoding.
    63  func (h Hash) String() string {
    64  	return encode(h[:])
    65  }
    66  
    67  // Of computes a new Hash from data.
    68  func Of(data []byte) Hash {
    69  	r := sha512.Sum512(data)
    70  	h := Hash{}
    71  	copy(h[:], r[:ByteLen])
    72  	return h
    73  }
    74  
    75  // New creates a new Hash backed by data, ensuring that data is an acceptable length.
    76  func New(data []byte) Hash {
    77  	d.PanicIfFalse(len(data) == ByteLen)
    78  	h := Hash{}
    79  	copy(h[:], data)
    80  	return h
    81  }
    82  
    83  // MaybeParse parses a string representing a hash as a Base32 encoded byte array.
    84  // If the string is not well formed then this returns (emptyHash, false).
    85  func MaybeParse(s string) (Hash, bool) {
    86  	match := pattern.FindStringSubmatch(s)
    87  	if match == nil {
    88  		return emptyHash, false
    89  	}
    90  	return New(decode(s)), true
    91  }
    92  
    93  // Parse parses a string representing a hash as a Base32 encoded byte array.
    94  // If the string is not well formed then this panics.
    95  func Parse(s string) Hash {
    96  	r, ok := MaybeParse(s)
    97  	if !ok {
    98  		d.PanicIfError(fmt.Errorf("Cound not parse Hash: %s", s))
    99  	}
   100  	return r
   101  }
   102  
   103  // Less compares two hashes returning whether this Hash is less than other.
   104  func (h Hash) Less(other Hash) bool {
   105  	return bytes.Compare(h[:], other[:]) < 0
   106  }
   107  
   108  // Greater compares two hashes returning whether this Hash is greater than other.
   109  func (h Hash) Greater(other Hash) bool {
   110  	// TODO: Remove this
   111  	return bytes.Compare(h[:], other[:]) > 0
   112  }
   113  
   114  // HashSet is a set of Hashes.
   115  type HashSet map[Hash]struct{}
   116  
   117  func NewHashSet(hashes ...Hash) HashSet {
   118  	out := make(HashSet, len(hashes))
   119  	for _, h := range hashes {
   120  		out.Insert(h)
   121  	}
   122  	return out
   123  }
   124  
   125  // Insert adds a Hash to the set.
   126  func (hs HashSet) Insert(hash Hash) {
   127  	hs[hash] = struct{}{}
   128  }
   129  
   130  // Has returns true if the HashSet contains hash.
   131  func (hs HashSet) Has(hash Hash) (has bool) {
   132  	_, has = hs[hash]
   133  	return
   134  }
   135  
   136  // Remove removes hash from the HashSet.
   137  func (hs HashSet) Remove(hash Hash) {
   138  	delete(hs, hash)
   139  }