// Source: github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/hash/hash.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

// Package hash implements the hash function used throughout Noms.
//
// Noms serialization from version 4-onward uses the first 20 bytes of sha-512 for hashes.
//
// sha-512 was chosen because:
//
//   - sha-1 is no longer recommended.
//   - sha-3 is brand new, not a lot of platform support.
//   - blake is not commonly used, not a lot of platform support.
//   - within sha-2, sha-512 is faster than sha-256 on 64 bit.
//
// Our specific truncation scheme (first 20 bytes) was chosen because:
//
//   - The "standard" truncation schemes are not widely supported. For example, at time of writing, there is no fast native implementation of sha512/256 on Node.
//   - The smallest standard truncation of sha512 is 28 bytes, but we don't need this many. And because we are a database, the size of the hashes matters. Bigger hashes mean less data in each chunk, which means less tree fan-out, which means slower iteration and searching.
20 bytes is a good balance between collision resistance and wide trees. 37 // - 20 bytes leads to a nice round number of base32 digits: 32. 38 // 39 // The textual serialization of hashes uses big-endian base32 with the alphabet {0-9,a-v}. This scheme was chosen because: 40 // 41 // - It's easy to convert to and from base32 without bignum arithemetic. 42 // - No special chars: you can double-click to select in GUIs. 43 // - Sorted hashes will be sorted textually, making it easy to scan for humans. 44 // 45 // In Noms, the hash function is a component of the serialization version, which is constant over the entire lifetime of a single database. So clients do not need to worry about encountering multiple hash functions in the same database. 46 package hash 47 48 import ( 49 "bytes" 50 "crypto/sha512" 51 "encoding/binary" 52 "fmt" 53 "regexp" 54 "strconv" 55 "strings" 56 57 "github.com/dolthub/dolt/go/store/d" 58 ) 59 60 const ( 61 // ByteLen is the number of bytes used to represent the Hash. 62 ByteLen = 20 63 64 // PrefixLen is the number of bytes used to represent the Prefix of the Hash. 65 PrefixLen = 8 // uint64 66 67 // SuffixLen is the number of bytes which come after the Prefix. 68 SuffixLen = ByteLen - PrefixLen 69 70 // StringLen is the number of characters need to represent the Hash using Base32. 71 StringLen = 32 // 20 * 8 / log2(32) 72 ) 73 74 var ( 75 pattern = regexp.MustCompile("^([0-9a-v]{" + strconv.Itoa(StringLen) + "})$") 76 emptyHash = Hash{} 77 ) 78 79 // Hash is used to represent the hash of a Noms Value. 80 type Hash [ByteLen]byte 81 82 // IsEmpty determines if this Hash is equal to the empty hash (all zeroes). 83 func (h Hash) IsEmpty() bool { 84 return h == emptyHash 85 } 86 87 // String returns a string representation of the hash using Base32 encoding. 88 func (h Hash) String() string { 89 return encode(h[:]) 90 } 91 92 // Of computes a new Hash from data. 
93 func Of(data []byte) Hash { 94 r := sha512.Sum512(data) 95 h := Hash{} 96 copy(h[:], r[:ByteLen]) 97 return h 98 } 99 100 // New creates a new Hash backed by data, ensuring that data is an acceptable length. 101 func New(data []byte) Hash { 102 d.PanicIfFalse(len(data) == ByteLen) 103 h := Hash{} 104 copy(h[:], data) 105 return h 106 } 107 108 // MaybeParse parses a string representing a hash as a Base32 encoded byte array. 109 // If the string is not well formed then this returns (emptyHash, false). 110 func MaybeParse(s string) (Hash, bool) { 111 match := pattern.FindStringSubmatch(s) 112 if match == nil { 113 return emptyHash, false 114 } 115 return New(decode(s)), true 116 } 117 118 // IsValid returns true if the provided string is a valid base32 encoded hash and false if it is not. 119 func IsValid(s string) bool { 120 return pattern.MatchString(s) 121 } 122 123 // Parse parses a string representing a hash as a Base32 encoded byte array. 124 // If the string is not well formed then this panics. 125 func Parse(s string) Hash { 126 r, ok := MaybeParse(s) 127 if !ok { 128 d.PanicIfError(fmt.Errorf("cound not parse Hash: %s", s)) 129 } 130 return r 131 } 132 133 // Prefix returns the first 8 bytes of the hash as a unit64. Used for chunk indexing 134 func (h Hash) Prefix() uint64 { 135 return binary.BigEndian.Uint64(h[:PrefixLen]) 136 } 137 138 // Suffix returns the last 12 bytes of the hash. Used for chunk indexing 139 func (h Hash) Suffix() []byte { 140 return h[PrefixLen:] 141 } 142 143 // Less compares two hashes returning whether this Hash is less than other. 144 func (h Hash) Less(other Hash) bool { 145 return h.Compare(other) < 0 146 } 147 148 // Compare compares two hashes returning a negative value if h < other, 0 if h == other, and 1 if h > other 149 func (h Hash) Compare(other Hash) int { 150 return bytes.Compare(h[:], other[:]) 151 } 152 153 // Equal compares two hashes returning whether this Hash is equal to other. 
154 func (h Hash) Equal(other Hash) bool { 155 return h.Compare(other) == 0 156 } 157 158 // HashSet is a set of Hashes. 159 type HashSet map[Hash]struct{} 160 161 func NewHashSet(hashes ...Hash) HashSet { 162 out := make(HashSet, len(hashes)) 163 for _, h := range hashes { 164 out.Insert(h) 165 } 166 return out 167 } 168 169 func (hs HashSet) Size() int { 170 return len(hs) 171 } 172 173 // Insert adds a Hash to the set. 174 func (hs HashSet) Insert(hash Hash) { 175 hs[hash] = struct{}{} 176 } 177 178 // Has returns true if the HashSet contains hash. 179 func (hs HashSet) Has(hash Hash) bool { 180 _, has := hs[hash] 181 return has 182 } 183 184 // Remove removes hash from the HashSet. 185 func (hs HashSet) Remove(hash Hash) { 186 delete(hs, hash) 187 } 188 189 // Copy returns a copy of the hashset 190 func (hs HashSet) Copy() HashSet { 191 copyOf := make(HashSet, len(hs)) 192 193 for k := range hs { 194 copyOf[k] = struct{}{} 195 } 196 197 return copyOf 198 } 199 200 // InsertAll inserts all elements of a HashSet into this HashSet 201 func (hs HashSet) InsertAll(other HashSet) { 202 for h, _ := range other { 203 hs[h] = struct{}{} 204 } 205 } 206 207 func (hs HashSet) Equals(other HashSet) bool { 208 if hs.Size() != other.Size() { 209 return false 210 } 211 for h := range hs { 212 if !other.Has(h) { 213 return false 214 } 215 } 216 return true 217 } 218 219 func (hs HashSet) Empty() { 220 for h := range hs { 221 delete(hs, h) 222 } 223 } 224 225 func (hs HashSet) String() string { 226 var sb strings.Builder 227 sb.Grow(len(hs)*34 + 100) 228 229 sb.WriteString("HashSet {\n") 230 for h := range hs { 231 sb.WriteString("\t") 232 sb.WriteString(h.String()) 233 sb.WriteString("\n") 234 } 235 sb.WriteString("}\n") 236 return sb.String() 237 }