github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/nbs/table_reader.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package nbs 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "io" 11 "sort" 12 "sync" 13 14 "github.com/attic-labs/noms/go/chunks" 15 "github.com/attic-labs/noms/go/d" 16 "github.com/attic-labs/noms/go/hash" 17 "github.com/golang/snappy" 18 ) 19 20 type tableIndex struct { 21 chunkCount uint32 22 totalUncompressedData uint64 23 prefixes, offsets []uint64 24 lengths, ordinals []uint32 25 suffixes []byte 26 } 27 28 type tableReaderAt interface { 29 ReadAtWithStats(p []byte, off int64, stats *Stats) (n int, err error) 30 } 31 32 // tableReader implements get & has queries against a single nbs table. goroutine safe. 33 // |blockSize| refers to the block-size of the underlying storage. We assume that, each time we read data, we actually have to read in blocks of this size. So, we're willing to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group more chunks together into a single read request to backing storage. 34 type tableReader struct { 35 tableIndex 36 r tableReaderAt 37 blockSize uint64 38 } 39 40 // parses a valid nbs tableIndex from a byte stream. |buff| must end with an NBS index and footer, though it may contain an unspecified number of bytes before that data. |tableIndex| doesn't keep alive any references to |buff|. 41 func parseTableIndex(buff []byte) tableIndex { 42 pos := uint64(len(buff)) 43 44 // footer 45 pos -= magicNumberSize 46 d.Chk.True(string(buff[pos:]) == magicNumber) 47 48 // total uncompressed chunk data 49 pos -= uint64Size 50 totalUncompressedData := binary.BigEndian.Uint64(buff[pos:]) 51 52 pos -= uint32Size 53 chunkCount := binary.BigEndian.Uint32(buff[pos:]) 54 55 // index 56 suffixesSize := uint64(chunkCount) * addrSuffixSize 57 pos -= suffixesSize 58 suffixes := make([]byte, suffixesSize) 59 copy(suffixes, buff[pos:]) 60 61 lengthsSize := uint64(chunkCount) * lengthSize 62 pos -= lengthsSize 63 lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize]) 64 65 tuplesSize := uint64(chunkCount) * prefixTupleSize 66 pos -= tuplesSize 67 prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize]) 68 69 return tableIndex{ 70 chunkCount, totalUncompressedData, 71 prefixes, offsets, 72 lengths, ordinals, 73 suffixes, 74 } 75 } 76 77 func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) { 78 lengths = make([]uint32, count) 79 offsets = make([]uint64, count) 80 81 lengths[0] = binary.BigEndian.Uint32(buff) 82 83 for i := uint64(1); i < uint64(count); i++ { 84 lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:]) 85 offsets[i] = offsets[i-1] + uint64(lengths[i-1]) 86 } 87 return 88 } 89 90 func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) { 91 prefixes = make([]uint64, count) 92 ordinals = make([]uint32, count) 93 94 for i := uint64(0); i < uint64(count); i++ { 95 idx := i * prefixTupleSize 96 prefixes[i] = binary.BigEndian.Uint64(buff[idx:]) 97 ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:]) 98 } 99 return 100 } 101 102 func (ti tableIndex) prefixIdxToOrdinal(idx uint32) uint32 { 103 return ti.ordinals[idx] 104 } 105 106 // returns the first position in |tr.prefixes| whose value == |prefix|. Returns |tr.chunkCount| 107 // if absent 108 func (ti tableIndex) prefixIdx(prefix uint64) (idx uint32) { 109 // NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in 110 // an extremely tight loop and inlining the code was a significant perf improvement. 111 idx, j := 0, ti.chunkCount 112 for idx < j { 113 h := idx + (j-idx)/2 // avoid overflow when computing h 114 // i ≤ h < j 115 if ti.prefixes[h] < prefix { 116 idx = h + 1 // preserves f(i-1) == false 117 } else { 118 j = h // preserves f(j) == true 119 } 120 } 121 122 return 123 } 124 125 // Return true IFF the suffix at insertion order |ordinal| matches the address |a|. 126 func (ti tableIndex) ordinalSuffixMatches(ordinal uint32, h addr) bool { 127 li := uint64(ordinal) * addrSuffixSize 128 return bytes.Compare(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize]) == 0 129 } 130 131 // returns the ordinal of |h| if present. returns |ti.chunkCount| if absent 132 func (ti tableIndex) lookupOrdinal(h addr) uint32 { 133 prefix := h.Prefix() 134 135 for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ { 136 ordinal := ti.prefixIdxToOrdinal(idx) 137 if ti.ordinalSuffixMatches(ordinal, h) { 138 return ordinal 139 } 140 } 141 142 return ti.chunkCount 143 } 144 145 // newTableReader parses a valid nbs table byte stream and returns a reader. buff must end with an NBS index and footer, though it may contain an unspecified number of bytes before that data. r should allow retrieving any desired range of bytes from the table. 146 func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) tableReader { 147 return tableReader{index, r, blockSize} 148 } 149 150 // Scan across (logically) two ordered slices of address prefixes. 151 func (tr tableReader) hasMany(addrs []hasRecord) (remaining bool) { 152 // TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs)) 153 154 filterIdx := uint32(0) 155 filterLen := uint32(len(tr.prefixes)) 156 157 for i, addr := range addrs { 158 if addr.has { 159 continue 160 } 161 162 for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] { 163 filterIdx++ 164 } 165 166 if filterIdx >= filterLen { 167 remaining = true 168 return 169 } 170 171 if addr.prefix != tr.prefixes[filterIdx] { 172 remaining = true 173 continue 174 } 175 176 // prefixes are equal, so locate and compare against the corresponding suffix 177 for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ { 178 if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *addr.a) { 179 addrs[i].has = true 180 break 181 } 182 } 183 184 if !addrs[i].has { 185 remaining = true 186 } 187 } 188 189 return 190 } 191 192 func (tr tableReader) count() uint32 { 193 return tr.chunkCount 194 } 195 196 func (tr tableReader) uncompressedLen() uint64 { 197 return tr.totalUncompressedData 198 } 199 200 func (tr tableReader) index() tableIndex { 201 return tr.tableIndex 202 } 203 204 // returns true iff |h| can be found in this table. 205 func (tr tableReader) has(h addr) bool { 206 ordinal := tr.lookupOrdinal(h) 207 return ordinal < tr.count() 208 } 209 210 // returns the storage associated with |h|, iff present. Returns nil if absent. On success, 211 // the returned byte slice directly references the underlying storage. 212 func (tr tableReader) get(h addr, stats *Stats) (data []byte) { 213 ordinal := tr.lookupOrdinal(h) 214 if ordinal == tr.count() { 215 return 216 } 217 218 offset := tr.offsets[ordinal] 219 length := uint64(tr.lengths[ordinal]) 220 buff := make([]byte, length) // TODO: Avoid this allocation for every get 221 222 n, err := tr.r.ReadAtWithStats(buff, int64(offset), stats) 223 d.Chk.NoError(err) 224 d.Chk.True(n == int(length)) 225 data = tr.parseChunk(buff) 226 d.Chk.True(data != nil) 227 228 return 229 } 230 231 type offsetRec struct { 232 a *addr 233 ordinal uint32 234 offset uint64 235 } 236 237 type offsetRecSlice []offsetRec 238 239 func (hs offsetRecSlice) Len() int { return len(hs) } 240 func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset } 241 func (hs offsetRecSlice) Swap(i, j int) { hs[i], hs[j] = hs[j], hs[i] } 242 243 func (tr tableReader) readAtOffsets( 244 readStart, readEnd uint64, 245 reqs []getRecord, 246 offsets offsetRecSlice, 247 foundChunks chan *chunks.Chunk, 248 wg *sync.WaitGroup, 249 stats *Stats, 250 ) { 251 252 readLength := readEnd - readStart 253 buff := make([]byte, readLength) 254 255 n, err := tr.r.ReadAtWithStats(buff, int64(readStart), stats) 256 257 d.Chk.NoError(err) 258 d.Chk.True(uint64(n) == readLength) 259 260 for _, rec := range offsets { 261 d.Chk.True(rec.offset >= readStart) 262 localStart := rec.offset - readStart 263 localEnd := localStart + uint64(tr.lengths[rec.ordinal]) 264 d.Chk.True(localEnd <= readLength) 265 data := tr.parseChunk(buff[localStart:localEnd]) 266 c := chunks.NewChunkWithHash(hash.Hash(*rec.a), data) 267 foundChunks <- &c 268 } 269 270 wg.Done() 271 272 } 273 274 // getMany retrieves multiple stored blocks and optimizes by attempting to read in larger physical 275 // blocks which contain multiple stored blocks. |reqs| must be sorted by address prefix. 276 func (tr tableReader) getMany( 277 reqs []getRecord, 278 foundChunks chan *chunks.Chunk, 279 wg *sync.WaitGroup, 280 stats *Stats, 281 ) (remaining bool) { 282 // Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set 283 // of table locations which must be read in order to satisfy the getMany operation. 284 offsetRecords, remaining := tr.findOffsets(reqs) 285 tr.getManyAtOffsets(reqs, offsetRecords, foundChunks, wg, stats) 286 return remaining 287 } 288 289 func (tr tableReader) getManyAtOffsets( 290 reqs []getRecord, 291 offsetRecords offsetRecSlice, 292 foundChunks chan *chunks.Chunk, 293 wg *sync.WaitGroup, 294 stats *Stats, 295 ) { 296 // Now |offsetRecords| contains all locations within the table which must be search (note 297 // that there may be duplicates of a particular location). Sort by offset and scan forward, 298 // grouping sequences of reads into large physical reads. 299 300 var batch offsetRecSlice 301 var readStart, readEnd uint64 302 303 for i := 0; i < len(offsetRecords); { 304 rec := offsetRecords[i] 305 length := tr.lengths[rec.ordinal] 306 307 if batch == nil { 308 batch = make(offsetRecSlice, 1) 309 batch[0] = offsetRecords[i] 310 readStart = rec.offset 311 readEnd = readStart + uint64(length) 312 i++ 313 continue 314 } 315 316 if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, tr.blockSize); canRead { 317 batch = append(batch, rec) 318 readEnd = newReadEnd 319 i++ 320 continue 321 } 322 323 wg.Add(1) 324 go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats) 325 batch = nil 326 } 327 328 if batch != nil { 329 wg.Add(1) 330 go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats) 331 batch = nil 332 } 333 334 return 335 } 336 337 // findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by 338 // address) to build the set of table locations which must be read in order to 339 // find each chunk specified by |reqs|. If this table contains all requested 340 // chunks remaining will be set to false upon return. If some are not here, 341 // then remaining will be true. The result offsetRecSlice is sorted in offset 342 // order. 343 func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) { 344 filterIdx := uint32(0) 345 filterLen := uint32(len(tr.prefixes)) 346 ors = make(offsetRecSlice, 0, len(reqs)) 347 348 // Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set 349 // of table locations which must be read in order to satisfy |reqs|. 350 for i, req := range reqs { 351 if req.found { 352 continue 353 } 354 355 // advance within the prefixes until we reach one which is >= req.prefix 356 for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix { 357 filterIdx++ 358 } 359 360 if filterIdx >= filterLen { 361 remaining = true // last prefix visited. 362 break 363 } 364 365 if req.prefix != tr.prefixes[filterIdx] { 366 remaining = true 367 continue 368 } 369 370 // record all offsets within the table which contain the data required. 371 for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ { 372 if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *req.a) { 373 reqs[i].found = true 374 ors = append(ors, offsetRec{req.a, tr.ordinals[j], tr.offsets[tr.ordinals[j]]}) 375 } 376 } 377 } 378 379 sort.Sort(ors) 380 return ors, remaining 381 } 382 383 func canReadAhead(fRec offsetRec, fLength uint32, readStart, readEnd, blockSize uint64) (newEnd uint64, canRead bool) { 384 if fRec.offset < readEnd { 385 // |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address 386 // prefix matches the prefix of a requested address. If the set of requests contains 387 // addresses which share a common prefix, then it's possible for multiple offsetRecords 388 // to reference the same table offset position. In that case, we'll see sequential 389 // offsetRecords with the same fRec.offset. 390 return readEnd, true 391 } 392 393 if fRec.offset-readEnd > blockSize { 394 return readEnd, false 395 } 396 397 return fRec.offset + uint64(fLength), true 398 } 399 400 // Fetches the byte stream of data logically encoded within the table starting at |pos|. 401 func (tr tableReader) parseChunk(buff []byte) []byte { 402 dataLen := uint64(len(buff)) - checksumSize 403 404 chksum := binary.BigEndian.Uint32(buff[dataLen:]) 405 d.Chk.True(chksum == crc(buff[:dataLen])) 406 407 data, err := snappy.Decode(nil, buff[:dataLen]) 408 d.Chk.NoError(err) 409 410 return data 411 } 412 413 func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool) { 414 var offsetRecords offsetRecSlice 415 // Pass #1: Build the set of table locations which must be read in order to find all the elements of |reqs| which are present in this table. 416 offsetRecords, remaining = tr.findOffsets(reqs) 417 418 // Now |offsetRecords| contains all locations within the table which must 419 // be searched (note that there may be duplicates of a particular 420 // location). Scan forward, grouping sequences of reads into large physical 421 // reads. 422 423 var readStart, readEnd uint64 424 readStarted := false 425 426 for i := 0; i < len(offsetRecords); { 427 rec := offsetRecords[i] 428 length := tr.lengths[rec.ordinal] 429 430 if !readStarted { 431 readStarted = true 432 reads++ 433 readStart = rec.offset 434 readEnd = readStart + uint64(length) 435 i++ 436 continue 437 } 438 439 if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, tr.blockSize); canRead { 440 readEnd = newReadEnd 441 i++ 442 continue 443 } 444 445 readStarted = false 446 } 447 448 return 449 } 450 451 func (tr tableReader) extract(chunks chan<- extractRecord) { 452 // Build reverse lookup table from ordinal -> chunk hash 453 hashes := make(addrSlice, len(tr.prefixes)) 454 for idx, prefix := range tr.prefixes { 455 ordinal := tr.prefixIdxToOrdinal(uint32(idx)) 456 binary.BigEndian.PutUint64(hashes[ordinal][:], prefix) 457 li := uint64(ordinal) * addrSuffixSize 458 copy(hashes[ordinal][addrPrefixSize:], tr.suffixes[li:li+addrSuffixSize]) 459 } 460 chunkLen := tr.offsets[tr.chunkCount-1] + uint64(tr.lengths[tr.chunkCount-1]) 461 buff := make([]byte, chunkLen) 462 n, err := tr.r.ReadAtWithStats(buff, int64(tr.offsets[0]), &Stats{}) 463 d.Chk.NoError(err) 464 d.Chk.True(uint64(n) == chunkLen) 465 466 sendChunk := func(i uint32) { 467 localOffset := tr.offsets[i] - tr.offsets[0] 468 chunks <- extractRecord{a: hashes[i], data: tr.parseChunk(buff[localOffset : localOffset+uint64(tr.lengths[i])])} 469 } 470 471 for i := uint32(0); i < tr.chunkCount; i++ { 472 sendChunk(i) 473 } 474 } 475 476 func (tr tableReader) reader() io.Reader { 477 return &readerAdapter{tr.r, 0} 478 } 479 480 type readerAdapter struct { 481 rat tableReaderAt 482 off int64 483 } 484 485 func (ra *readerAdapter) Read(p []byte) (n int, err error) { 486 n, err = ra.rat.ReadAtWithStats(p, ra.off, &Stats{}) 487 ra.off += int64(n) 488 return 489 }