github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"crypto/sha512"
	"encoding/base32"
	"encoding/binary"
	"hash/crc32"
	"io"

	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

/*
An NBS Table stores N byte slices ("chunks"), each addressed by a 20-byte hash of its
contents. The footer encodes N as well as the total bytes consumed by all contained chunks.
An Index maps each address to the position of its corresponding chunk. Addresses are logically
sorted within the Index, but the corresponding chunks need not be.

Table:
+----------------+----------------+-----+----------------+-------+--------+
| Chunk Record 0 | Chunk Record 1 | ... | Chunk Record N | Index | Footer |
+----------------+----------------+-----+----------------+-------+--------+

Chunk Record:
+---------------------------+----------------+
| (Chunk Length) Chunk Data | (Uint32) CRC32 |
+---------------------------+----------------+

Index:
+------------+---------+----------+
| Prefix Map | Lengths | Suffixes |
+------------+---------+----------+

Prefix Map:
+--------------+--------------+-----+----------------+
| Prefix Tuple | Prefix Tuple | ... | Prefix Tuple N |
+--------------+--------------+-----+----------------+

- The Prefix Map contains N Prefix Tuples.
- Each Prefix Tuple corresponds to a unique Chunk Record in the Table.
- The Prefix Tuples are sorted in increasing lexicographic order within the Prefix Map.
- NB: THE SAME PREFIX MAY APPEAR MULTIPLE TIMES, as distinct Hashes (referring to distinct Chunks) may share the same Prefix.

Prefix Tuple:
+-----------------+------------------+
| (8) Hash Prefix | (Uint32) Ordinal |
+-----------------+------------------+

- Hash Prefix is the first 8 bytes of a Chunk's Hash.
- Ordinal is the 0-based position of the associated record within the sequence of chunk records, of the associated Length within Lengths, and of the associated Hash Suffix within Suffixes.

Lengths:
+-----------------+-----------------+-----+-------------------+
| (Uint32) Length | (Uint32) Length | ... | (Uint32) Length N |
+-----------------+-----------------+-----+-------------------+

- Each Length is the length of a Chunk Record in this Table.
- Length M must correspond to Chunk Record M for 0 <= M <= N.
Suffixes:
+------------------+------------------+-----+--------------------+
| (12) Hash Suffix | (12) Hash Suffix | ... | (12) Hash Suffix N |
+------------------+------------------+-----+--------------------+

- Each Hash Suffix is the last 12 bytes of the Hash of a Chunk in this Table.
- Hash Suffix M must correspond to Chunk Record M for 0 <= M <= N.

Footer:
+----------------------+----------------------------------------+------------------+
| (Uint32) Chunk Count | (Uint64) Total Uncompressed Chunk Data | (8) Magic Number |
+----------------------+----------------------------------------+------------------+

- Total Uncompressed Chunk Data is the sum of the uncompressed byte lengths of all contained chunk byte slices.
- Magic Number is the first 8 bytes of the SHA256 hash of "https://github.com/attic-labs/nbs".

NOTE: Unsigned integer quantities, hashes and hash suffixes are all encoded big-endian.


Looking up Chunks in an NBS Table
There are two phases to loading chunk data for a given Hash from an NBS Table: checking for the chunk's presence, and fetching the chunk's bytes. When performing a has-check, only the first phase is necessary.

Phase one: Chunk presence
- Slice off the first 8 bytes of your Hash to create a Prefix.
- Since the Prefix Tuples in the Prefix Map are in lexicographic order, binary search the Prefix Map for the desired Prefix.
- For all Prefix Tuples with a matching Prefix:
  - Load the Ordinal.
  - Use the Ordinal to index into Suffixes.
  - Check the Suffix of your Hash against the loaded Suffix.
  - If they match, your chunk is in this Table, in the Chunk Record indicated by Ordinal.
  - If they don't match, continue to the next matching Prefix Tuple.
- If no Prefix Tuple matches, your chunk is not in this Table.

Phase two: Loading Chunk data
- Take the Ordinal discovered in Phase one.
- Calculate the Offset of your desired Chunk Record: Sum(Lengths[0]...Lengths[Ordinal-1]).
- Load Lengths[Ordinal] bytes from Table[Offset].
- The last 4 bytes of the loaded data are the CRC32 of the Chunk Data that precedes them (see the Chunk Record layout above). Verify the CRC; the preceding bytes are your Chunk data.
*/
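// findChunkRecordSketch is a minimal, illustrative implementation of the two
// lookup phases described in the comment above, written against a fully
// materialized, in-memory copy of an Index. The parameters (prefixes, ordinals,
// lengths, suffixes) are hypothetical flattenings of the Prefix Map, Lengths and
// Suffixes sections; the real lookup path in this package operates on the encoded
// index via the table index and reader types, not on slices like these.
func findChunkRecordSketch(h addr, prefixes []uint64, ordinals, lengths []uint32, suffixes []byte) (offset uint64, length uint32, ok bool) {
	// Phase one: binary search the sorted Prefix Map for the first tuple whose
	// Prefix is >= the first 8 bytes of the desired Hash.
	p := h.Prefix()
	lo, hi := 0, len(prefixes)
	for lo < hi {
		mid := (lo + hi) / 2
		if prefixes[mid] < p {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	// The same Prefix may appear multiple times; check each candidate's Hash Suffix.
	for i := lo; i < len(prefixes) && prefixes[i] == p; i++ {
		ord := ordinals[i]
		suffix := suffixes[uint64(ord)*addrSuffixSize : uint64(ord+1)*addrSuffixSize]
		if !bytes.Equal(suffix, h[addrPrefixSize:]) {
			continue
		}
		// Phase two: the Chunk Record's offset is the sum of the lengths of all
		// records that precede it; its size is Lengths[Ordinal].
		for m := uint32(0); m < ord; m++ {
			offset += uint64(lengths[m])
		}
		return offset, lengths[ord], true
	}
	return 0, 0, false
}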
const (
	addrSize        = 20
	addrPrefixSize  = 8
	addrSuffixSize  = addrSize - addrPrefixSize
	uint64Size      = 8
	uint32Size      = 4
	ordinalSize     = uint32Size
	lengthSize      = uint32Size
	magicNumber     = "\xff\xb5\xd8\xc2\x24\x63\xee\x50"
	magicNumberSize = 8 // len(magicNumber)
	footerSize      = uint32Size + uint64Size + magicNumberSize
	prefixTupleSize = addrPrefixSize + ordinalSize
	checksumSize    = uint32Size
	maxChunkSize    = 0xffffffff // Snappy won't compress slices bigger than this
)

var crcTable = crc32.MakeTable(crc32.Castagnoli)

func crc(b []byte) uint32 {
	return crc32.Update(0, crcTable, b)
}

func computeAddrDefault(data []byte) addr {
	r := sha512.Sum512(data)
	h := addr{}
	copy(h[:], r[:addrSize])
	return h
}

var computeAddr = computeAddrDefault

type addr [addrSize]byte

var encoding = base32.NewEncoding("0123456789abcdefghijklmnopqrstuv")

func (a addr) String() string {
	return encoding.EncodeToString(a[:])
}

func (a addr) Prefix() uint64 {
	return binary.BigEndian.Uint64(a[:])
}

func (a addr) Checksum() uint32 {
	return binary.BigEndian.Uint32(a[addrSize-checksumSize:])
}

func parseAddr(str string) (addr, error) {
	var h addr
	_, err := encoding.Decode(h[:], []byte(str))
	return h, err
}

func ValidateAddr(s string) bool {
	_, err := encoding.DecodeString(s)
	return err == nil
}

type addrSlice []addr

func (hs addrSlice) Len() int           { return len(hs) }
func (hs addrSlice) Less(i, j int) bool { return bytes.Compare(hs[i][:], hs[j][:]) < 0 }
func (hs addrSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type hasRecord struct {
	a      *addr
	prefix uint64
	order  int
	has    bool
}

type hasRecordByPrefix []hasRecord

func (hs hasRecordByPrefix) Len() int           { return len(hs) }
func (hs hasRecordByPrefix) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix }
func (hs hasRecordByPrefix) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type hasRecordByOrder []hasRecord

func (hs hasRecordByOrder) Len() int           { return len(hs) }
func (hs hasRecordByOrder) Less(i, j int) bool { return hs[i].order < hs[j].order }
func (hs hasRecordByOrder) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type getRecord struct {
	a      *addr
	prefix uint64
	found  bool
}

type getRecordByPrefix []getRecord

func (hs getRecordByPrefix) Len() int           { return len(hs) }
func (hs getRecordByPrefix) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix }
func (hs getRecordByPrefix) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type extractRecord struct {
	a    addr
	data []byte
	err  error
}
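// parseFooterSketch is an illustrative decoding of the Footer layout documented
// at the top of this file; the name and signature are hypothetical and not part
// of this package's API. Given the trailing footerSize bytes of a table, it
// recovers the chunk count and total uncompressed data size, and derives the size
// of the Index, which stores one Prefix Tuple, one Length and one Hash Suffix per
// chunk.
func parseFooterSketch(footer []byte) (chunkCount uint32, totalUncompressed uint64, indexBytes uint64, ok bool) {
	if len(footer) != footerSize || string(footer[uint32Size+uint64Size:]) != magicNumber {
		return 0, 0, 0, false
	}
	chunkCount = binary.BigEndian.Uint32(footer[:uint32Size])
	totalUncompressed = binary.BigEndian.Uint64(footer[uint32Size : uint32Size+uint64Size])
	// Each chunk contributes one Prefix Tuple, one Length and one Hash Suffix to the Index.
	indexBytes = uint64(chunkCount) * (prefixTupleSize + lengthSize + addrSuffixSize)
	return chunkCount, totalUncompressed, indexBytes, true
}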
type chunkReader interface {
	has(h addr) (bool, error)
	hasMany(addrs []hasRecord) (bool, error)
	get(ctx context.Context, h addr, stats *Stats) ([]byte, error)
	getMany(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(*chunks.Chunk), stats *Stats) (bool, error)
	getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(CompressedChunk), stats *Stats) (bool, error)
	extract(ctx context.Context, chunks chan<- extractRecord) error
	count() (uint32, error)
	uncompressedLen() (uint64, error)

	// Close releases resources retained by the |chunkReader|.
	Close() error
}

type chunkReadPlanner interface {
	findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool)
	getManyAtOffsets(
		ctx context.Context,
		eg *errgroup.Group,
		offsetRecords offsetRecSlice,
		found func(*chunks.Chunk),
		stats *Stats,
	) error
	getManyCompressedAtOffsets(
		ctx context.Context,
		eg *errgroup.Group,
		offsetRecords offsetRecSlice,
		found func(CompressedChunk),
		stats *Stats,
	) error
}

type chunkSource interface {
	chunkReader
	hash() (addr, error)
	calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error)

	// reader opens a Reader positioned at the first byte of the chunkData segment of this table.
	reader(context.Context) (io.Reader, error)
	index() (tableIndex, error)

	// Clone returns a |chunkSource| with the same contents as the
	// original, but with independent |Close| behavior. A |chunkSource|
	// cannot be |Close|d more than once, so if a |chunkSource| is being
	// retained in two objects with independent life-cycles, it should be
	// |Clone|d first.
	Clone() chunkSource
}

type chunkSources []chunkSource

// TableFile is an interface for working with an existing table file.
type TableFile interface {
	// FileID gets the id of the file.
	FileID() string

	// NumChunks returns the number of chunks in a table file.
	NumChunks() int

	// Open returns an io.ReadCloser which can be used to read the bytes of a table file.
	Open(ctx context.Context) (io.ReadCloser, error)
}

// TableFileStoreOps describes what is possible to do with TableFiles in a TableFileStore.
type TableFileStoreOps struct {
	// True if the TableFileStore supports reading table files.
	CanRead bool
	// True if the TableFileStore supports writing table files.
	CanWrite bool
	// True if the TableFileStore supports pruning unused table files.
	CanPrune bool
	// True if the TableFileStore supports garbage collecting chunks.
	CanGC bool
}

// TableFileStore is an interface for interacting with table files directly.
type TableFileStore interface {
	// Sources retrieves the current root hash, a list of all the table files (which may include appendix table files),
	// and a second list containing only appendix table files.
	Sources(ctx context.Context) (hash.Hash, []TableFile, []TableFile, error)

	// Size returns the total size, in bytes, of the table files in this Store.
	Size(ctx context.Context) (uint64, error)

	// WriteTableFile reads a table file from the provided reader and writes it to the TableFileStore.
	WriteTableFile(ctx context.Context, fileId string, numChunks int, rd io.Reader, contentLength uint64, contentHash []byte) error

	// PruneTableFiles deletes old table files that are no longer referenced in the manifest.
	PruneTableFiles(ctx context.Context) error

	// SetRootChunk changes the root chunk hash from the previous value to the new root.
	SetRootChunk(ctx context.Context, root, previous hash.Hash) error

	// SupportedOperations returns a description of the supported TableFile operations. Some stores only support reading table files, not writing.
	SupportedOperations() TableFileStoreOps
}
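// countChunksSketch is a hypothetical illustration of how the TableFileStore and
// TableFile interfaces compose; it is not part of this package's API. It lists a
// store's current table files and totals their chunk counts. Sources' first list
// already includes any appendix table files, so only that list is traversed.
func countChunksSketch(ctx context.Context, tfs TableFileStore) (int, error) {
	_, tableFiles, _, err := tfs.Sources(ctx)
	if err != nil {
		return 0, err
	}
	total := 0
	for _, tf := range tableFiles {
		total += tf.NumChunks()
	}
	return total, nil
}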