github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/journal_record.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"bufio"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/dolthub/dolt/go/store/d"
	"github.com/dolthub/dolt/go/store/hash"
)

// journalRec is a record in a chunk journal. Its serialization format uses
// uint8 tag prefixes to identify fields and allow for format evolution.
//
// There are two kinds of journalRecs: chunk records and root hash records.
// Chunk records store chunks from persisted memTables. Root hash records
// store root hash updates to the manifest state.
// Future record kinds may include other updates to manifest state, such as
// updates to the GC generation or the table set lock hash.
//
// +-----------------+-------+---------+-----+-------------------+
// | length (uint32) | tag 0 | field 0 | ... | checksum (uint32) |
// +-----------------+-------+---------+-----+-------------------+
//
// Currently, the payload field is always written as the penultimate field,
// followed only by the fixed-width record checksum. This allows the payload
// to be extracted from the journalRec using only the record length and payload
// offset. See recLookup for more detail.
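//
// As an illustrative sketch (field order as written by writeChunkRecord below),
// a chunk record whose snappy-compressed payload is 512 bytes is laid out as:
//
//	length   (4 bytes)   total record size: 4 + 2 + 21 + 1 + 512 + 4 = 544
//	tag      (1 byte)    kindJournalRecTag
//	kind     (1 byte)    chunkJournalRecKind
//	tag      (1 byte)    addrJournalRecTag
//	address  (20 bytes)  chunk hash
//	tag      (1 byte)    payloadJournalRecTag
//	payload  (512 bytes) snappy-compressed chunk bytes
//	checksum (4 bytes)   crc of all preceding bytes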
type journalRec struct {
	length    uint32
	kind      journalRecKind
	address   hash.Hash
	payload   []byte
	timestamp time.Time
	checksum  uint32
}

// payloadOffset returns the journalOffset of the payload within the record
// assuming only the checksum field follows the payload.
func (r journalRec) payloadOffset() uint32 {
	return r.length - uint32(len(r.payload)+journalRecChecksumSz)
}

// uncompressedPayloadSize returns the uncompressed size of the payload.
func (r journalRec) uncompressedPayloadSize() (sz uint64) {
	// |r.payload| is snappy-encoded and starts with
	// the uvarint-encoded uncompressed data size
	sz, _ = binary.Uvarint(r.payload)
	return
}
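
// For instance, a payload holding a chunk that is 300 bytes uncompressed begins
// with the uvarint bytes 0xAC 0x02 (300 encoded as a varint), regardless of how
// well the remainder of the chunk compresses.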

type journalRecKind uint8

const (
	unknownJournalRecKind  journalRecKind = 0
	rootHashJournalRecKind journalRecKind = 1
	chunkJournalRecKind    journalRecKind = 2
)

type journalRecTag uint8

const (
	unknownJournalRecTag   journalRecTag = 0
	kindJournalRecTag      journalRecTag = 1
	addrJournalRecTag      journalRecTag = 2
	payloadJournalRecTag   journalRecTag = 3
	timestampJournalRecTag journalRecTag = 4
)

const (
	journalRecTagSz       = 1
	journalRecLenSz       = 4
	journalRecKindSz      = 1
	journalRecAddrSz      = 20
	journalRecChecksumSz  = 4
	journalRecTimestampSz = 8

	// todo(andy): less arbitrary
	journalRecMaxSz = 128 * 1024
)

// journalRecordTimestampGenerator returns the current time in Unix epoch seconds. This function is stored in a
// variable so that unit tests can override it to ensure the journal record timestamps are a known, expected value.
var journalRecordTimestampGenerator = func() uint64 {
	return uint64(time.Now().Unix())
}
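
// For example, a test in this package could pin timestamps to a fixed value
// (restoring the original generator afterwards) along the lines of:
//
//	saved := journalRecordTimestampGenerator
//	journalRecordTimestampGenerator = func() uint64 { return 1234567890 }
//	defer func() { journalRecordTimestampGenerator = saved }()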

func chunkRecordSize(c CompressedChunk) (recordSz, payloadOff uint32) {
	recordSz += journalRecLenSz
	recordSz += journalRecTagSz + journalRecKindSz
	recordSz += journalRecTagSz + journalRecAddrSz
	recordSz += journalRecTagSz // payload tag
	payloadOff = recordSz
	recordSz += uint32(len(c.FullCompressedChunk))
	recordSz += journalRecChecksumSz
	return
}

func rootHashRecordSize() (recordSz int) {
	recordSz += journalRecLenSz
	recordSz += journalRecTagSz + journalRecKindSz
	recordSz += journalRecTagSz + journalRecAddrSz
	recordSz += journalRecTagSz + journalRecTimestampSz
	recordSz += journalRecChecksumSz
	return
}
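
// With the constants above, rootHashRecordSize works out to a fixed
// 4 + (1+1) + (1+20) + (1+8) + 4 = 40 bytes, while chunkRecordSize is a fixed
// 32-byte overhead plus len(c.FullCompressedChunk).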

func writeChunkRecord(buf []byte, c CompressedChunk) (n uint32) {
	// length
	l, _ := chunkRecordSize(c)
	writeUint32(buf[:journalRecLenSz], l)
	n += journalRecLenSz
	// kind
	buf[n] = byte(kindJournalRecTag)
	n += journalRecTagSz
	buf[n] = byte(chunkJournalRecKind)
	n += journalRecKindSz
	// address
	buf[n] = byte(addrJournalRecTag)
	n += journalRecTagSz
	copy(buf[n:], c.H[:])
	n += journalRecAddrSz
	// payload
	buf[n] = byte(payloadJournalRecTag)
	n += journalRecTagSz
	copy(buf[n:], c.FullCompressedChunk)
	n += uint32(len(c.FullCompressedChunk))
	// checksum
	writeUint32(buf[n:], crc(buf[:n]))
	n += journalRecChecksumSz
	d.PanicIfFalse(l == n)
	return
}

func writeRootHashRecord(buf []byte, root hash.Hash) (n uint32) {
	// length
	l := rootHashRecordSize()
	writeUint32(buf[:journalRecLenSz], uint32(l))
	n += journalRecLenSz

	// kind
	buf[n] = byte(kindJournalRecTag)
	n += journalRecTagSz
	buf[n] = byte(rootHashJournalRecKind)
	n += journalRecKindSz

	// timestamp
	buf[n] = byte(timestampJournalRecTag)
	n += journalRecTagSz
	writeUint64(buf[n:], journalRecordTimestampGenerator())
	n += journalRecTimestampSz

	// address
	buf[n] = byte(addrJournalRecTag)
	n += journalRecTagSz
	copy(buf[n:], root[:])
	n += journalRecAddrSz

	// empty payload

	// checksum
	writeUint32(buf[n:], crc(buf[:n]))
	n += journalRecChecksumSz
	return
}
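
// Note that writeRootHashRecord orders its fields length | kind | timestamp |
// address | checksum and carries no payload. readJournalRecord below dispatches
// on each field's tag, so it handles both this ordering and the chunk record
// ordering written by writeChunkRecord.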

func readJournalRecord(buf []byte) (rec journalRec, err error) {
	rec.length = readUint32(buf)
	buf = buf[journalRecLenSz:]
	for len(buf) > journalRecChecksumSz {
		tag := journalRecTag(buf[0])
		buf = buf[journalRecTagSz:]
		switch tag {
		case kindJournalRecTag:
			rec.kind = journalRecKind(buf[0])
			buf = buf[journalRecKindSz:]
		case addrJournalRecTag:
			copy(rec.address[:], buf)
			buf = buf[journalRecAddrSz:]
		case timestampJournalRecTag:
			unixSeconds := readUint64(buf)
			rec.timestamp = time.Unix(int64(unixSeconds), 0)
			buf = buf[journalRecTimestampSz:]
		case payloadJournalRecTag:
			sz := len(buf) - journalRecChecksumSz
			rec.payload = buf[:sz]
			buf = buf[sz:]
		case unknownJournalRecTag:
			fallthrough
		default:
			err = fmt.Errorf("unknown record field tag: %d", tag)
			return
		}
	}
	rec.checksum = readUint32(buf[:journalRecChecksumSz])
	return
}

func validateJournalRecord(buf []byte) bool {
	if len(buf) < (journalRecLenSz + journalRecChecksumSz) {
		return false
	}
	off := readUint32(buf)
	if int(off) > len(buf) {
		return false
	}
	off -= journalRecChecksumSz
	return crc(buf[:off]) == readUint32(buf[off:])
}

// processJournalRecords iterates over a chunk journal's records by reading from disk using |r|, starting at
// offset |off|, and calls the callback function |cb| with each journal record. It returns the offset at which
// processing stopped, or any error encountered along the way.
func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb func(o int64, r journalRec) error) (int64, error) {
	var (
		buf []byte
		err error
	)

	// start processing records from |off|
	if _, err = r.Seek(off, io.SeekStart); err != nil {
		return 0, err
	}

	rdr := bufio.NewReaderSize(r, journalWriterBuffSize)
	for {
		// peek to read next record size
		if buf, err = rdr.Peek(uint32Size); err != nil {
			break
		}

		l := readUint32(buf)
		if l > journalRecMaxSz {
			break
		} else if buf, err = rdr.Peek(int(l)); err != nil {
			break
		}

		if !validateJournalRecord(buf) {
			break // stop if we can't validate |rec|
		}

		var rec journalRec
		if rec, err = readJournalRecord(buf); err != nil {
			break // failed to read valid record
		}
		if err = cb(off, rec); err != nil {
			break
		}

		// advance |rdr| state by |l| bytes
		if _, err = io.ReadFull(rdr, buf); err != nil {
			break
		}
		off += int64(len(buf))
	}
	if err != nil && err != io.EOF {
		return 0, err
	}
	// reset the file pointer to end of the last
	// successfully processed journal record
	if _, err = r.Seek(off, io.SeekStart); err != nil {
		return 0, err
	}
	return off, nil
}
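
// As an illustrative sketch, a caller scanning an open journal file from the
// beginning might look like the following (ctx and journalFile are hypothetical
// placeholders for a context.Context and an open journal file):
//
//	lastOff, err := processJournalRecords(ctx, journalFile, 0, func(o int64, rec journalRec) error {
//		if rec.kind == rootHashJournalRecKind {
//			fmt.Printf("root hash %s written at offset %d\n", rec.address, o)
//		}
//		return nil
//	})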

func peekRootHashAt(journal io.ReaderAt, offset int64) (root hash.Hash, err error) {
	expSz := rootHashRecordSize()
	buf := make([]byte, expSz) // assumes len(rec) is exactly rootHashRecordSize
	n, err := journal.ReadAt(buf, offset)
	if errors.Is(err, io.EOF) {
		err = nil // EOF is expected for last record
	} else if err != nil {
		return
	} else if n != expSz {
		err = fmt.Errorf("invalid root hash record at %d: %d", offset, n)
		return
	}
	sz := readUint32(buf)
	if sz > uint32(expSz) {
		err = fmt.Errorf("invalid root hash record size at %d", offset)
		return
	}
	buf = buf[:sz]
	if !validateJournalRecord(buf) {
		err = fmt.Errorf("failed to validate root hash record at %d", offset)
		return
	}
	var rec journalRec
	if rec, err = readJournalRecord(buf); err != nil {
		return
	} else if rec.kind != rootHashJournalRecKind {
		err = fmt.Errorf("expected root hash record, got kind: %d", rec.kind)
		return
	}
	return hash.Hash(rec.address), nil
}

func readUint32(buf []byte) uint32 {
	return binary.BigEndian.Uint32(buf)
}

func writeUint32(buf []byte, u uint32) {
	binary.BigEndian.PutUint32(buf, u)
}

func readUint64(buf []byte) uint64 {
	return binary.BigEndian.Uint64(buf)
}

func writeUint64(buf []byte, u uint64) {
	binary.BigEndian.PutUint64(buf, u)
}