github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/hash/hash.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // This file incorporates work covered by the following copyright and 16 // permission notice: 17 // 18 // Copyright 2016 Attic Labs, Inc. All rights reserved. 19 // Licensed under the Apache License, version 2.0: 20 // http://www.apache.org/licenses/LICENSE-2.0 21 22 // Package hash implements the hash function used throughout Noms. 23 // 24 // Noms serialization from version 4-onward uses the first 20 bytes of sha-512 for hashes. 25 // 26 // sha-512 was chosen because: 27 // 28 // - sha-1 is no longer recommended. 29 // - sha-3 is brand new, not a lot of platform support. 30 // - blake is not commonly used, not a lot of platform support. 31 // - within sha-2, sha-512 is faster than sha-256 on 64 bit. 32 // 33 // Our specific truncation scheme (first 20 bytes) was chosen because: 34 // 35 // - The "standard" truncation schemes are not widely supported. For example, at time of writing, there is no fast native implementation of sha512/256 on Node. 36 // - The smallest standard truncation of sha512 is 28 bytes, but we don't need this many. And because we are a database, the size of the hashes matters. Bigger hashes mean less data in each chunk, which means less tree fan-out, which means slower iteration and searching. 20 bytes is a good balance between collision resistance and wide trees. 37 // - 20 bytes leads to a nice round number of base32 digits: 32. 38 // 39 // The textual serialization of hashes uses big-endian base32 with the alphabet {0-9,a-v}. This scheme was chosen because: 40 // 41 // - It's easy to convert to and from base32 without bignum arithemetic. 42 // - No special chars: you can double-click to select in GUIs. 43 // - Sorted hashes will be sorted textually, making it easy to scan for humans. 44 // 45 // In Noms, the hash function is a component of the serialization version, which is constant over the entire lifetime of a single database. So clients do not need to worry about encountering multiple hash functions in the same database. 46 package hash 47 48 import ( 49 "bytes" 50 "crypto/sha512" 51 "fmt" 52 "regexp" 53 "strconv" 54 55 "github.com/dolthub/dolt/go/store/d" 56 ) 57 58 const ( 59 // ByteLen is the number of bytes used to represent the Hash. 60 ByteLen = 20 61 62 // StringLen is the number of characters need to represent the Hash using Base32. 63 StringLen = 32 // 20 * 8 / log2(32) 64 ) 65 66 var ( 67 pattern = regexp.MustCompile("^([0-9a-v]{" + strconv.Itoa(StringLen) + "})$") 68 emptyHash = Hash{} 69 ) 70 71 // Hash is used to represent the hash of a Noms Value. 72 type Hash [ByteLen]byte 73 74 // IsEmpty determines if this Hash is equal to the empty hash (all zeroes). 75 func (h Hash) IsEmpty() bool { 76 return h == emptyHash 77 } 78 79 // String returns a string representation of the hash using Base32 encoding. 80 func (h Hash) String() string { 81 return encode(h[:]) 82 } 83 84 // Of computes a new Hash from data. 85 func Of(data []byte) Hash { 86 r := sha512.Sum512(data) 87 h := Hash{} 88 copy(h[:], r[:ByteLen]) 89 return h 90 } 91 92 // New creates a new Hash backed by data, ensuring that data is an acceptable length. 93 func New(data []byte) Hash { 94 d.PanicIfFalse(len(data) == ByteLen) 95 h := Hash{} 96 copy(h[:], data) 97 return h 98 } 99 100 // MaybeParse parses a string representing a hash as a Base32 encoded byte array. 101 // If the string is not well formed then this returns (emptyHash, false). 102 func MaybeParse(s string) (Hash, bool) { 103 match := pattern.FindStringSubmatch(s) 104 if match == nil { 105 return emptyHash, false 106 } 107 return New(decode(s)), true 108 } 109 110 // IsValid returns true if the provided string is a valid base32 encoded hash and false if it is not. 111 func IsValid(s string) bool { 112 return pattern.MatchString(s) 113 } 114 115 // Parse parses a string representing a hash as a Base32 encoded byte array. 116 // If the string is not well formed then this panics. 117 func Parse(s string) Hash { 118 r, ok := MaybeParse(s) 119 if !ok { 120 d.PanicIfError(fmt.Errorf("cound not parse Hash: %s", s)) 121 } 122 return r 123 } 124 125 // Less compares two hashes returning whether this Hash is less than other. 126 func (h Hash) Less(other Hash) bool { 127 return bytes.Compare(h[:], other[:]) < 0 128 } 129 130 // Greater compares two hashes returning whether this Hash is greater than other. 131 func (h Hash) Greater(other Hash) bool { 132 // TODO: Remove this 133 return bytes.Compare(h[:], other[:]) > 0 134 } 135 136 // Equal compares two hashes returning whether this Hash is equal to other. 137 func (h Hash) Equal(other Hash) bool { 138 return bytes.Compare(h[:], other[:]) == 0 139 } 140 141 // HashSet is a set of Hashes. 142 type HashSet map[Hash]struct{} 143 144 func NewHashSet(hashes ...Hash) HashSet { 145 out := make(HashSet, len(hashes)) 146 for _, h := range hashes { 147 out.Insert(h) 148 } 149 return out 150 } 151 152 // Insert adds a Hash to the set. 153 func (hs HashSet) Insert(hash Hash) { 154 hs[hash] = struct{}{} 155 } 156 157 // Has returns true if the HashSet contains hash. 158 func (hs HashSet) Has(hash Hash) (has bool) { 159 _, has = hs[hash] 160 return 161 } 162 163 // Remove removes hash from the HashSet. 164 func (hs HashSet) Remove(hash Hash) { 165 delete(hs, hash) 166 }