github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/journal_index_record.go (about)

     1  // Copyright 2023 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package nbs
    16  
    17  import (
    18  	"bufio"
    19  	"encoding/binary"
    20  	"errors"
    21  	"fmt"
    22  	"hash/crc32"
    23  	"io"
    24  
    25  	"github.com/dolthub/dolt/go/store/d"
    26  	"github.com/dolthub/dolt/go/store/hash"
    27  )
    28  
    29  // indexRec is a record in a chunk journal index file. Index records
    30  // serve as out-of-band chunk indexes into the chunk journal that allow
    31  // bootstrapping the journal without reading each record in the journal.
    32  //
    33  // Like journalRec, its serialization format uses uint8 tag prefixes
    34  // to identify fields and allow for format evolution.
    35  type indexRec struct {
    36  	// index record length
    37  	length uint32
    38  
    39  	// root hash of commit when this index record was written
    40  	lastRoot hash.Hash
    41  
    42  	// file offsets for the region of the journal file
    43  	// that |payload| indexes. end points to a root hash
    44  	// record in the journal containing |lastRoot|.
    45  	// we expect a sequence of index records to cover
    46  	// contiguous regions of the journal file.
    47  	start, end uint64
    48  
    49  	// index record kind
    50  	kind indexRecKind
    51  
    52  	// encoded chunk index
    53  	payload []byte
    54  
    55  	// index record crc32 checksum
    56  	checksum uint32
    57  }
    58  
    59  type indexRecKind uint8
    60  
    61  const (
    62  	unknownIndexRecKind indexRecKind = 0
    63  	tableIndexRecKind   indexRecKind = 1
    64  )
    65  
    66  type indexRecTag uint8
    67  
    68  const (
    69  	unknownIndexRecTag     indexRecTag = 0
    70  	lastRootIndexRecTag    indexRecTag = 1
    71  	startOffsetIndexRecTag indexRecTag = 2
    72  	endOffsetIndexRecTag   indexRecTag = 3
    73  	kindIndexRecTag        indexRecTag = 4
    74  	payloadIndexRecTag     indexRecTag = 5
    75  )
    76  
    77  const (
    78  	indexRecTagSz      = 1
    79  	indexRecLenSz      = 4
    80  	indexRecKindSz     = 1
    81  	indexRecLastRootSz = 20
    82  	indexRecOffsetSz   = 8
    83  	indexRecChecksumSz = 4
    84  	lookupSz           = 16 + uint64Size + uint32Size
    85  	lookupMetaSz       = uint64Size + uint64Size + uint32Size + hash.ByteLen
    86  )
    87  
    88  func journalIndexRecordSize(idx []byte) (recordSz uint32) {
    89  	recordSz += indexRecLenSz
    90  	recordSz += indexRecTagSz + indexRecLastRootSz
    91  	recordSz += indexRecTagSz + indexRecOffsetSz
    92  	recordSz += indexRecTagSz + indexRecOffsetSz
    93  	recordSz += indexRecTagSz + indexRecKindSz
    94  	recordSz += indexRecTagSz // payload tag
    95  	recordSz += uint32(len(idx))
    96  	recordSz += indexRecChecksumSz
    97  	return
    98  }
    99  
   100  func writeJournalIndexRecord(buf []byte, root hash.Hash, start, end uint64, idx []byte) (n uint32) {
   101  	//defer trace.StartRegion(ctx, "writeJournalIndexRecord").End()
   102  
   103  	// length
   104  	l := journalIndexRecordSize(idx)
   105  	writeUint32(buf[:indexRecLenSz], l)
   106  	n += indexRecLenSz
   107  	// last root
   108  	buf[n] = byte(lastRootIndexRecTag)
   109  	n += indexRecTagSz
   110  	copy(buf[n:], root[:])
   111  	n += indexRecLastRootSz
   112  	// start offset
   113  	buf[n] = byte(startOffsetIndexRecTag)
   114  	n += indexRecTagSz
   115  	writeUint64(buf[n:], start)
   116  	n += indexRecOffsetSz
   117  	// end offset
   118  	buf[n] = byte(endOffsetIndexRecTag)
   119  	n += indexRecTagSz
   120  	writeUint64(buf[n:], end)
   121  	n += indexRecOffsetSz
   122  	// kind
   123  	buf[n] = byte(kindIndexRecTag)
   124  	n += indexRecTagSz
   125  	buf[n] = byte(tableIndexRecKind)
   126  	n += indexRecKindSz
   127  	// payload
   128  	buf[n] = byte(payloadIndexRecTag)
   129  	n += indexRecTagSz
   130  	copy(buf[n:], idx)
   131  	n += uint32(len(idx))
   132  	// checksum
   133  	writeUint32(buf[n:], crc(buf[:n]))
   134  	n += indexRecChecksumSz
   135  	d.PanicIfFalse(l == n)
   136  	return
   137  }
   138  
   139  func readJournalIndexRecord(buf []byte) (rec indexRec, err error) {
   140  	rec.length = readUint32(buf)
   141  	buf = buf[indexRecLenSz:]
   142  	for len(buf) > indexRecChecksumSz {
   143  		tag := indexRecTag(buf[0])
   144  		buf = buf[indexRecTagSz:]
   145  		switch tag {
   146  		case lastRootIndexRecTag:
   147  			copy(rec.lastRoot[:], buf)
   148  			buf = buf[indexRecLastRootSz:]
   149  		case startOffsetIndexRecTag:
   150  			rec.start = readUint64(buf)
   151  			buf = buf[indexRecOffsetSz:]
   152  		case endOffsetIndexRecTag:
   153  			rec.end = readUint64(buf)
   154  			buf = buf[indexRecOffsetSz:]
   155  		case kindIndexRecTag:
   156  			rec.kind = indexRecKind(buf[0])
   157  			buf = buf[indexRecKindSz:]
   158  		case payloadIndexRecTag:
   159  			sz := len(buf) - indexRecChecksumSz
   160  			rec.payload = buf[:sz]
   161  			buf = buf[sz:]
   162  		case unknownIndexRecTag:
   163  			fallthrough
   164  		default:
   165  			err = fmt.Errorf("unknown record field tag: %d", tag)
   166  			return
   167  		}
   168  	}
   169  	rec.checksum = readUint32(buf[:indexRecChecksumSz])
   170  	return
   171  }
   172  
   173  func validateIndexRecord(buf []byte) bool {
   174  	if len(buf) < (indexRecLenSz + indexRecChecksumSz) {
   175  		return false
   176  	}
   177  	off := readUint32(buf)
   178  	if int(off) > len(buf) {
   179  		return false
   180  	}
   181  	off -= indexRecChecksumSz
   182  	return crc(buf[:off]) == readUint32(buf[off:])
   183  }
   184  
   185  type lookupMeta struct {
   186  	batchStart int64
   187  	batchEnd   int64
   188  	checkSum   uint32
   189  	latestHash hash.Hash
   190  }
   191  
   192  const indexRecTypeSize = 1
   193  const (
   194  	indexRecChunk byte = iota
   195  	indexRecMeta
   196  )
   197  
   198  // processIndexRecords reads batches of chunk index lookups into the journal.
   199  // An index batch looks like |lookup|lookup|...|meta|. The first byte of a record
   200  // indicates whether it is a |lookup| or |meta|. Only callback errors are returned.
   201  // The caller is expected to track the latest lookupMeta end offset and truncate
   202  // the index to compensate for partially written batches.
   203  func processIndexRecords(rd *bufio.Reader, sz int64, cb func(lookupMeta, []lookup, uint32) error) (off int64, err error) {
   204  	var batchCrc uint32
   205  	var batch []lookup
   206  	var batchOff int64
   207  	for off < sz {
   208  		recTag, err := rd.ReadByte()
   209  		if err != nil {
   210  			return off, nil
   211  		}
   212  		batchOff += 1
   213  
   214  		switch recTag {
   215  		case indexRecChunk:
   216  			l, err := readIndexLookup(rd)
   217  			if err != nil {
   218  				return off, nil
   219  			}
   220  			batchOff += lookupSz
   221  			batch = append(batch, l)
   222  			batchCrc = crc32.Update(batchCrc, crcTable, l.a[:])
   223  
   224  		case indexRecMeta:
   225  			m, err := readIndexMeta(rd)
   226  			if err != nil {
   227  				return off, nil
   228  			}
   229  			if err := cb(m, batch, batchCrc); err != nil {
   230  				return off, err
   231  			}
   232  			batch = nil
   233  			batchCrc = 0
   234  			off += batchOff + lookupMetaSz
   235  			batchOff = 0
   236  		default:
   237  			return off, ErrMalformedIndex
   238  		}
   239  	}
   240  	return off, nil
   241  }
   242  
// ErrMalformedIndex is returned by processIndexRecords when a record in the
// journal index starts with an unrecognized record type byte.
var ErrMalformedIndex = errors.New("journal index is malformed")
   244  
   245  // readIndexLookup reads a sequence of |chunkAddress|journalOffset|chunkLength|
   246  // that is used to speed up |journal.ranges| initialization.
   247  func readIndexLookup(r *bufio.Reader) (lookup, error) {
   248  	addr := addr16{}
   249  	if _, err := io.ReadFull(r, addr[:]); err != nil {
   250  		return lookup{}, err
   251  	}
   252  
   253  	var offsetBuf [uint64Size]byte
   254  	if _, err := io.ReadFull(r, offsetBuf[:]); err != nil {
   255  		return lookup{}, err
   256  	}
   257  	offset := binary.BigEndian.Uint64(offsetBuf[:])
   258  
   259  	var lengthBuf [uint32Size]byte
   260  	if _, err := io.ReadFull(r, lengthBuf[:]); err != nil {
   261  		return lookup{}, err
   262  	}
   263  	length := binary.BigEndian.Uint32(lengthBuf[:])
   264  
   265  	return lookup{a: addr, r: Range{Offset: offset, Length: length}}, nil
   266  }
   267  
   268  // readIndexMeta reads a sequence of |journalStart|journalEnd|lastRootHash|checksum|
   269  // that is used to validate a range of lookups on read. A corrupted lookup in the
   270  // start-end range will cause the checksum/crc check to fail. The last root hash
   271  // is a duplicate sanity check.
   272  func readIndexMeta(r *bufio.Reader) (lookupMeta, error) {
   273  	var startBuf [offsetSize]byte
   274  	if _, err := io.ReadFull(r, startBuf[:]); err != nil {
   275  		return lookupMeta{}, err
   276  	}
   277  	startOff := binary.BigEndian.Uint64(startBuf[:])
   278  
   279  	var endBuf [offsetSize]byte
   280  	if _, err := io.ReadFull(r, endBuf[:]); err != nil {
   281  		return lookupMeta{}, err
   282  	}
   283  	endOff := binary.BigEndian.Uint64(endBuf[:])
   284  
   285  	var checksumBuf [checksumSize]byte
   286  	if _, err := io.ReadFull(r, checksumBuf[:]); err != nil {
   287  		return lookupMeta{}, err
   288  	}
   289  	checksum := binary.BigEndian.Uint32(checksumBuf[:])
   290  
   291  	addr := hash.Hash{}
   292  	if _, err := io.ReadFull(r, addr[:]); err != nil {
   293  		return lookupMeta{}, err
   294  	}
   295  
   296  	return lookupMeta{
   297  		batchStart: int64(startOff),
   298  		batchEnd:   int64(endOff),
   299  		checkSum:   checksum,
   300  		latestHash: addr,
   301  	}, nil
   302  }
   303  
   304  func writeIndexLookup(w *bufio.Writer, l lookup) error {
   305  	w.WriteByte(indexRecChunk)
   306  
   307  	if _, err := w.Write(l.a[:]); err != nil {
   308  		return err
   309  	}
   310  
   311  	var offsetBuf [offsetSize]byte
   312  	binary.BigEndian.PutUint64(offsetBuf[:], l.r.Offset)
   313  	if _, err := w.Write(offsetBuf[:]); err != nil {
   314  		return err
   315  	}
   316  
   317  	var lengthBuf [lengthSize]byte
   318  	binary.BigEndian.PutUint32(lengthBuf[:], l.r.Length)
   319  	if _, err := w.Write(lengthBuf[:]); err != nil {
   320  		return err
   321  	}
   322  
   323  	return nil
   324  }
   325  
   326  // writeJournalIndexMeta writes a metadata record for an index range to verify
   327  // index bootstrapping integrity. Includes the range of index lookups, a CRC
   328  // checksum, and the latest root hash before |end|.
   329  func writeJournalIndexMeta(w *bufio.Writer, root hash.Hash, start, end int64, checksum uint32) error {
   330  	// |journal start|journal end|last root hash|range checkSum|
   331  
   332  	if err := w.WriteByte(indexRecMeta); err != nil {
   333  		return err
   334  	}
   335  
   336  	startBuf := make([]byte, offsetSize)
   337  	binary.BigEndian.PutUint64(startBuf, uint64(start))
   338  	if _, err := w.Write(startBuf); err != nil {
   339  		return err
   340  	}
   341  
   342  	endBuf := make([]byte, offsetSize)
   343  	binary.BigEndian.PutUint64(endBuf, uint64(end))
   344  	if _, err := w.Write(endBuf); err != nil {
   345  		return err
   346  	}
   347  
   348  	checksumBuf := make([]byte, checksumSize)
   349  	binary.BigEndian.PutUint32(checksumBuf, checksum)
   350  	if _, err := w.Write(checksumBuf); err != nil {
   351  		return err
   352  	}
   353  
   354  	if _, err := w.Write(root[:]); err != nil {
   355  		return err
   356  	}
   357  
   358  	return nil
   359  }
   360  
// lookup is one decoded chunk index entry: a chunk address paired with the
// journal offset and chunk length that readIndexLookup stores in its Range.
type lookup struct {
	// a is the chunk address
	a addr16
	// r holds the journal offset and length of the chunk
	r Range
}