github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/nbs/table_reader.go

// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"encoding/binary"
	"io"
	"sort"
	"sync"

	"github.com/attic-labs/noms/go/chunks"
	"github.com/attic-labs/noms/go/d"
	"github.com/attic-labs/noms/go/hash"
	"github.com/golang/snappy"
)

type tableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	prefixes, offsets     []uint64
	lengths, ordinals     []uint32
	suffixes              []byte
}

type tableReaderAt interface {
	ReadAtWithStats(p []byte, off int64, stats *Stats) (n int, err error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	tableIndex
	r         tableReaderAt
	blockSize uint64
}
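
// For illustration (the 4096-byte figure is an assumption, not something the package
// mandates): with |blockSize| = 4096, two requested chunks whose records lie within
// 4096 bytes of each other in the table are fetched with a single ReadAtWithStats
// call, at the cost of also reading the unrequested bytes between them. See
// canReadAhead below.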

// parses a valid nbs tableIndex from a byte stream. |buff| must end with an NBS index
// and footer, though it may contain an unspecified number of bytes before that data.
// |tableIndex| doesn't keep alive any references to |buff|.
func parseTableIndex(buff []byte) tableIndex {
	pos := uint64(len(buff))

	// footer
	pos -= magicNumberSize
	d.Chk.True(string(buff[pos:]) == magicNumber)

	// total uncompressed chunk data
	pos -= uint64Size
	totalUncompressedData := binary.BigEndian.Uint64(buff[pos:])

	pos -= uint32Size
	chunkCount := binary.BigEndian.Uint32(buff[pos:])

	// index
	suffixesSize := uint64(chunkCount) * addrSuffixSize
	pos -= suffixesSize
	suffixes := make([]byte, suffixesSize)
	copy(suffixes, buff[pos:])

	lengthsSize := uint64(chunkCount) * lengthSize
	pos -= lengthsSize
	lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize])

	tuplesSize := uint64(chunkCount) * prefixTupleSize
	pos -= tuplesSize
	prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize])

	return tableIndex{
		chunkCount, totalUncompressedData,
		prefixes, offsets,
		lengths, ordinals,
		suffixes,
	}
}
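
// Working backward from the end of |buff|, the layout parseTableIndex expects is
// (concrete widths are implied by the Uint32/Uint64 reads above; the remaining widths
// come from the package constants):
//
//	... preceding chunk records ...
//	prefix tuples: chunkCount x (8-byte address prefix + 4-byte ordinal)
//	lengths:       chunkCount x 4-byte chunk record length
//	suffixes:      chunkCount x addrSuffixSize-byte address suffix
//	footer:        4-byte chunkCount, 8-byte totalUncompressedData, magicNumber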

func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) {
	lengths = make([]uint32, count)
	offsets = make([]uint64, count)

	lengths[0] = binary.BigEndian.Uint32(buff)

	for i := uint64(1); i < uint64(count); i++ {
		lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:])
		offsets[i] = offsets[i-1] + uint64(lengths[i-1])
	}
	return
}
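
// For example (illustrative values only): lengths of {100, 50, 200} produce offsets of
// {0, 100, 150}, i.e. each chunk record starts where the previous one ends, with the
// first record at offset 0.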

func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) {
	prefixes = make([]uint64, count)
	ordinals = make([]uint32, count)

	for i := uint64(0); i < uint64(count); i++ {
		idx := i * prefixTupleSize
		prefixes[i] = binary.BigEndian.Uint64(buff[idx:])
		ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:])
	}
	return
}

func (ti tableIndex) prefixIdxToOrdinal(idx uint32) uint32 {
	return ti.ordinals[idx]
}

// returns the first position in |ti.prefixes| whose value is >= |prefix|. Returns
// |ti.chunkCount| if no such position exists.
func (ti tableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if ti.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}

	return
}
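
// The hand-inlined search above is equivalent to the stdlib binary search. The
// following sketch (not used by the reader; included purely for comparison) expresses
// the same lookup with sort.Search.
func (ti tableIndex) prefixIdxViaSort(prefix uint64) uint32 {
	return uint32(sort.Search(int(ti.chunkCount), func(i int) bool {
		return ti.prefixes[i] >= prefix
	}))
}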

// Returns true iff the suffix at insertion order |ordinal| matches the address |h|.
func (ti tableIndex) ordinalSuffixMatches(ordinal uint32, h addr) bool {
	li := uint64(ordinal) * addrSuffixSize
	return bytes.Equal(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
}

// returns the ordinal of |h| if present. returns |ti.chunkCount| if absent
func (ti tableIndex) lookupOrdinal(h addr) uint32 {
	prefix := h.Prefix()

	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ {
		ordinal := ti.prefixIdxToOrdinal(idx)
		if ti.ordinalSuffixMatches(ordinal, h) {
			return ordinal
		}
	}

	return ti.chunkCount
}

// newTableReader returns a tableReader over the given, already-parsed |index|. |r| must
// allow retrieving any desired range of bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) tableReader {
	return tableReader{index, r, blockSize}
}
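
// exampleByteReaderAt is a minimal sketch of a tableReaderAt backed by a table held
// entirely in memory. It is illustrative only; the name is not part of this package's
// API, and the package's real tableReaderAt implementations live in other files.
type exampleByteReaderAt struct {
	buff []byte
}

func (b exampleByteReaderAt) ReadAtWithStats(p []byte, off int64, stats *Stats) (int, error) {
	if off < 0 || off >= int64(len(b.buff)) {
		return 0, io.EOF
	}
	n := copy(p, b.buff[off:])
	if n < len(p) {
		return n, io.EOF
	}
	return n, nil
}

// Composing the pieces over an in-memory table |buff| would then look like the
// following (the 4096 block size and |someAddr| are placeholders):
//
//	tr := newTableReader(parseTableIndex(buff), exampleByteReaderAt{buff}, 4096)
//	found := tr.has(someAddr)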

// Scan across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (remaining bool) {
	// TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs))

	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))

	for i, addr := range addrs {
		if addr.has {
			continue
		}

		for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true
			return
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *addr.a) {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return
}
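
// To make the scan concrete (illustrative values only): with tr.prefixes of
// {2, 5, 5, 9} and sorted query prefixes {3, 5}, the filter index advances past 2 for
// the query 3, finds 5 != 3 and marks it remaining, then matches the query 5 against
// both entries with prefix 5, comparing suffixes until one agrees.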

func (tr tableReader) count() uint32 {
	return tr.chunkCount
}

func (tr tableReader) uncompressedLen() uint64 {
	return tr.totalUncompressedData
}

func (tr tableReader) index() tableIndex {
	return tr.tableIndex
}

// returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) bool {
	ordinal := tr.lookupOrdinal(h)
	return ordinal < tr.count()
}

// returns the chunk data associated with |h| iff present; returns nil if absent. On
// success, the returned byte slice holds the decompressed chunk data.
func (tr tableReader) get(h addr, stats *Stats) (data []byte) {
	ordinal := tr.lookupOrdinal(h)
	if ordinal == tr.count() {
		return
	}

	offset := tr.offsets[ordinal]
	length := uint64(tr.lengths[ordinal])
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(buff, int64(offset), stats)
	d.Chk.NoError(err)
	d.Chk.True(n == int(length))
	data = tr.parseChunk(buff)
	d.Chk.True(data != nil)

	return
}

type offsetRec struct {
	a       *addr
	ordinal uint32
	offset  uint64
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

func (tr tableReader) readAtOffsets(
	readStart, readEnd uint64,
	reqs []getRecord,
	offsets offsetRecSlice,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) {
	readLength := readEnd - readStart
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(buff, int64(readStart), stats)

	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == readLength)

	for _, rec := range offsets {
		d.Chk.True(rec.offset >= readStart)
		localStart := rec.offset - readStart
		localEnd := localStart + uint64(tr.lengths[rec.ordinal])
		d.Chk.True(localEnd <= readLength)
		data := tr.parseChunk(buff[localStart:localEnd])
		c := chunks.NewChunkWithHash(hash.Hash(*rec.a), data)
		foundChunks <- &c
	}

	wg.Done()
}

// getMany retrieves multiple stored chunks, optimizing by grouping nearby chunks into
// larger physical reads. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(
	reqs []getRecord,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) (remaining bool) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	tr.getManyAtOffsets(reqs, offsetRecords, foundChunks, wg, stats)
	return remaining
}

func (tr tableReader) getManyAtOffsets(
	reqs []getRecord,
	offsetRecords offsetRecSlice,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) {
	// Now |offsetRecords| contains all locations within the table which must be searched
	// (note that there may be duplicates of a particular location). They are sorted by
	// offset; scan forward, grouping sequences of reads into large physical reads.

	var batch offsetRecSlice
	var readStart, readEnd uint64

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if batch == nil {
			batch = make(offsetRecSlice, 1)
			batch[0] = offsetRecords[i]
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, length, readStart, readEnd, tr.blockSize); canRead {
			batch = append(batch, rec)
			readEnd = newReadEnd
			i++
			continue
		}

		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats)
		batch = nil
	}

	if batch != nil {
		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats)
		batch = nil
	}
}

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by
// address) to build the set of table locations which must be read in order to
// find each chunk specified by |reqs|. If this table contains all requested
// chunks, remaining will be false upon return; if some are not here, remaining
// will be true. The returned offsetRecSlice is sorted in offset order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// advance within the prefixes until we reach one which is >= req.prefix
		for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *req.a) {
				reqs[i].found = true
				ors = append(ors, offsetRec{req.a, tr.ordinals[j], tr.offsets[tr.ordinals[j]]})
			}
		}
	}

	sort.Sort(ors)
	return ors, remaining
}

func canReadAhead(fRec offsetRec, fLength uint32, readStart, readEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < readEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return readEnd, true
	}

	if fRec.offset-readEnd > blockSize {
		return readEnd, false
	}

	return fRec.offset + uint64(fLength), true
}
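
// Worked example (illustrative numbers only), with blockSize = 4096: if the current
// batch covers readStart = 0, readEnd = 4096, a record at offset 6000 with length 1000
// has a gap of 1904 bytes (<= 4096), so the batch grows to readEnd = 7000; a further
// record at offset 12000 has a gap of 5000 bytes (> 4096), so the batch is closed and
// a new physical read begins.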

// parseChunk verifies the checksum that trails |buff| and returns the snappy-decoded
// chunk data.
func (tr tableReader) parseChunk(buff []byte) []byte {
	dataLen := uint64(len(buff)) - checksumSize

	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	d.Chk.True(chksum == crc(buff[:dataLen]))

	data, err := snappy.Decode(nil, buff[:dataLen])
	d.Chk.NoError(err)

	return data
}
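
// As parsed above, each chunk record in the table is laid out as the snappy-compressed
// chunk data followed by a 4-byte big-endian checksum of those compressed bytes (read
// here via binary.BigEndian.Uint32 and computed by the crc helper defined elsewhere in
// this package).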

func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find all the
	// elements of |reqs| which are present in this table.
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.

	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, length, readStart, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

func (tr tableReader) extract(chunks chan<- extractRecord) {
	// Build reverse lookup table from ordinal -> chunk hash
	hashes := make(addrSlice, len(tr.prefixes))
	for idx, prefix := range tr.prefixes {
		ordinal := tr.prefixIdxToOrdinal(uint32(idx))
		binary.BigEndian.PutUint64(hashes[ordinal][:], prefix)
		li := uint64(ordinal) * addrSuffixSize
		copy(hashes[ordinal][addrPrefixSize:], tr.suffixes[li:li+addrSuffixSize])
	}
	chunkLen := tr.offsets[tr.chunkCount-1] + uint64(tr.lengths[tr.chunkCount-1])
	buff := make([]byte, chunkLen)
	n, err := tr.r.ReadAtWithStats(buff, int64(tr.offsets[0]), &Stats{})
	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == chunkLen)

	sendChunk := func(i uint32) {
		localOffset := tr.offsets[i] - tr.offsets[0]
		chunks <- extractRecord{a: hashes[i], data: tr.parseChunk(buff[localOffset : localOffset+uint64(tr.lengths[i])])}
	}

	for i := uint32(0); i < tr.chunkCount; i++ {
		sendChunk(i)
	}
}

func (tr tableReader) reader() io.Reader {
	return &readerAdapter{tr.r, 0}
}

type readerAdapter struct {
	rat tableReaderAt
	off int64
}

func (ra *readerAdapter) Read(p []byte) (n int, err error) {
	n, err = ra.rat.ReadAtWithStats(p, ra.off, &Stats{})
	ra.off += int64(n)
	return
}
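
// For example (sketch only; |dst| and |tr| are placeholders): the adapter lets the raw
// table bytes be streamed with the standard io helpers,
//
//	n, err := io.Copy(dst, tr.reader())
//
// the copy ending when the underlying tableReaderAt reports io.EOF or another error.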