github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/record/record.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package record reads and writes sequences of records. Each record is a stream
     6  // of bytes that completes before the next record starts.
     7  //
     8  // When reading, call Next to obtain an io.Reader for the next record. Next will
     9  // return io.EOF when there are no more records. It is valid to call Next
    10  // without reading the current record to exhaustion.
    11  //
    12  // When writing, call Next to obtain an io.Writer for the next record. Calling
    13  // Next finishes the current record. Call Close to finish the final record.
    14  //
    15  // Optionally, call Flush to finish the current record and flush the underlying
    16  // writer without starting a new record. To start a new record after flushing,
    17  // call Next.
    18  //
    19  // Neither Readers nor Writers are safe to use concurrently.
    20  //
    21  // Example code:
    22  //
    23  //	func read(r io.Reader) ([]string, error) {
    24  //		var ss []string
    25  //		records := record.NewReader(r)
    26  //		for {
    27  //			rec, err := records.Next()
    28  //			if err == io.EOF {
    29  //				break
    30  //			}
    31  //			if err != nil {
    32  //				log.Printf("recovering from %v", err)
    33  //				r.Recover()
    34  //				continue
    35  //			}
    36  //			s, err := io.ReadAll(rec)
    37  //			if err != nil {
    38  //				log.Printf("recovering from %v", err)
    39  //				r.Recover()
    40  //				continue
    41  //			}
    42  //			ss = append(ss, string(s))
    43  //		}
    44  //		return ss, nil
    45  //	}
    46  //
    47  //	func write(w io.Writer, ss []string) error {
    48  //		records := record.NewWriter(w)
    49  //		for _, s := range ss {
    50  //			rec, err := records.Next()
    51  //			if err != nil {
    52  //				return err
    53  //			}
    54  //			if _, err := rec.Write([]byte(s)); err != nil {
    55  //				return err
    56  //			}
    57  //		}
    58  //		return records.Close()
    59  //	}
    60  //
    61  // The wire format is that the stream is divided into 32KiB blocks, and each
    62  // block contains a number of tightly packed chunks. Chunks cannot cross block
    63  // boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a
    64  // block must be zero.
    65  //
    66  // A record maps to one or more chunks. There are two chunk formats: legacy and
    67  // recyclable. The legacy chunk format:
    68  //
    69  //	+----------+-----------+-----------+--- ... ---+
    70  //	| CRC (4B) | Size (2B) | Type (1B) | Payload   |
    71  //	+----------+-----------+-----------+--- ... ---+
    72  //
    73  // CRC is computed over the type and payload
    74  // Size is the length of the payload in bytes
    75  // Type is the chunk type
    76  //
    77  // There are four chunk types: whether the chunk is the full record, or the
    78  // first, middle or last chunk of a multi-chunk record. A multi-chunk record
    79  // has one first chunk, zero or more middle chunks, and one last chunk.
    80  //
    81  // The recyclable chunk format is similar to the legacy format, but extends
    82  // the chunk header with an additional log number field. This allows reuse
    83  // (recycling) of log files which can provide significantly better performance
    84  // when syncing frequently as it avoids needing to update the file
    85  // metadata. Additionally, recycling log files is a prerequisite for using direct
    86  // IO with log writing. The recyclable format is:
    87  //
    88  //	+----------+-----------+-----------+----------------+--- ... ---+
    89  //	| CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
    90  //	+----------+-----------+-----------+----------------+--- ... ---+
    91  //
    92  // Recyclable chunks are distinguished from legacy chunks by the addition of 4
    93  // extra "recyclable" chunk types that map directly to the legacy chunk types
    94  // (i.e. full, first, middle, last). The CRC is computed over the type, log
    95  // number, and payload.
    96  //
    97  // The wire format allows for limited recovery in the face of data corruption:
    98  // on a format error (such as a checksum mismatch), the reader moves to the
    99  // next block and looks for the next full or first chunk.
   100  package record
   101  
   102  // The C++ Level-DB code calls this the log, but it has been renamed to record
   103  // to avoid clashing with the standard log package, and because it is generally
   104  // useful outside of logging. The C++ code also uses the term "physical record"
   105  // instead of "chunk", but "chunk" is shorter and less confusing.
   106  
   107  import (
   108  	"encoding/binary"
   109  	"io"
   110  
   111  	"github.com/cockroachdb/errors"
   112  	"github.com/cockroachdb/pebble/internal/base"
   113  	"github.com/cockroachdb/pebble/internal/crc"
   114  )
   115  
// These constants are part of the wire format and should not be changed.
const (
	// Legacy chunk types, stored in the type byte (offset 6) of a chunk
	// header. A record is either a single full chunk, or one first chunk,
	// zero or more middle chunks, and one last chunk.
	fullChunkType   = 1
	firstChunkType  = 2
	middleChunkType = 3
	lastChunkType   = 4

	// Recyclable chunk types. Each one is its legacy counterpart shifted by
	// recyclableFullChunkType-1; the reader subtracts that amount after it has
	// validated the log number carried in the longer recyclable header.
	recyclableFullChunkType   = 5
	recyclableFirstChunkType  = 6
	recyclableMiddleChunkType = 7
	recyclableLastChunkType   = 8
)

// Block and chunk-header geometry:
//   - blockSize is the size in bytes of a block (32 KiB); it is also the size
//     of the Reader and Writer buffers.
//   - blockSizeMask extracts the offset within a block from a file offset.
//   - legacyHeaderSize is the size of a legacy chunk header: 4 bytes of CRC,
//     2 bytes of length and 1 byte of chunk type.
//   - recyclableHeaderSize is the legacy header plus a 4-byte log number.
const (
	blockSize            = 32 * 1024
	blockSizeMask        = blockSize - 1
	legacyHeaderSize     = 7
	recyclableHeaderSize = legacyHeaderSize + 4
)
   135  
var (
	// ErrNotAnIOSeeker is returned by seekRecord if the io.Reader underlying a
	// Reader does not implement io.Seeker.
	ErrNotAnIOSeeker = errors.New("pebble/record: reader does not implement io.Seeker")

	// ErrNoLastRecord is returned if LastRecordOffset is called and there is
	// no previous record, i.e. Writer.Next has not been called yet.
	ErrNoLastRecord = errors.New("pebble/record: no last record exists")

	// ErrZeroedChunk is returned if a chunk is encountered that is zeroed
	// (its checksum, length and type are all zero). This usually occurs due to
	// log file preallocation.
	ErrZeroedChunk = base.CorruptionErrorf("pebble/record: zeroed chunk")

	// ErrInvalidChunk is returned if a chunk is encountered with an invalid
	// header, length, or checksum, or with a log number that does not match
	// the Reader's while in the middle of a record. This usually occurs when a
	// log is recycled, but can also occur due to corruption.
	ErrInvalidChunk = base.CorruptionErrorf("pebble/record: invalid chunk")
)
   152  
   153  // IsInvalidRecord returns true if the error matches one of the error types
   154  // returned for invalid records. These are treated in a way similar to io.EOF
   155  // in recovery code.
   156  func IsInvalidRecord(err error) bool {
   157  	return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF
   158  }
   159  
// Reader reads records from an underlying io.Reader. It buffers one block at
// a time and hands out a singleReader per record via Next.
type Reader struct {
	// r is the underlying reader.
	r io.Reader
	// logNum is the low 32-bits of the log's file number. May be zero when used
	// with log files that do not have a file number (e.g. the MANIFEST).
	// Recyclable chunks whose log number differs from logNum are rejected.
	logNum uint32
	// blockNum is the zero based block number currently held in buf. It is -1
	// until the first block has been read.
	blockNum int64
	// seq is the sequence number of the current record. It is incremented by
	// Next, recover and seekRecord; a singleReader created under an older
	// value refuses to read.
	seq int
	// buf[begin:end] is the unread portion of the current chunk's payload. The
	// low bound, begin, excludes the chunk header.
	begin, end int
	// n is the number of bytes of buf that are valid. Once reading has started,
	// only the final block can have n < blockSize.
	n int
	// recovering is true when recovering from corruption. It is set by recover
	// and cleared once nextChunk returns a valid chunk.
	recovering bool
	// last is whether the current chunk is the last chunk of the record.
	last bool
	// err is any accumulated error. While it is non-nil, Next, seekRecord and
	// singleReader.Read return it without doing any work.
	err error
	// buf is the buffer holding the current block.
	buf [blockSize]byte
}
   186  
   187  // NewReader returns a new reader. If the file contains records encoded using
   188  // the recyclable record format, then the log number in those records must
   189  // match the specified logNum.
   190  func NewReader(r io.Reader, logNum base.DiskFileNum) *Reader {
   191  	return &Reader{
   192  		r:        r,
   193  		logNum:   uint32(logNum),
   194  		blockNum: -1,
   195  	}
   196  }
   197  
// nextChunk sets r.buf[r.begin:r.end] to hold the next chunk's payload,
// reading the next block into the buffer if necessary.
//
// wantFirst is true when the caller is looking for the start of a record. In
// that mode chunks that are not full or first chunks are skipped, a recyclable
// chunk with a foreign log number is reported as io.EOF, and a clean end of
// input is reported as io.EOF. When wantFirst is false (the caller is in the
// middle of a record) the same conditions yield ErrInvalidChunk or
// io.ErrUnexpectedEOF instead, so that a partial record is never returned.
//
// On success it returns nil, sets r.last according to the chunk type and
// clears r.recovering.
func (r *Reader) nextChunk(wantFirst bool) error {
	for {
		// There is room for at least a legacy header after the current chunk:
		// try to decode the next chunk from the buffered block.
		if r.end+legacyHeaderSize <= r.n {
			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
			chunkType := r.buf[r.end+6]

			if checksum == 0 && length == 0 && chunkType == 0 {
				if r.end+recyclableHeaderSize > r.n {
					// Skip the rest of the block if the recyclable header size does not
					// fit within it.
					r.end = r.n
					continue
				}
				if r.recovering {
					// Skip the rest of the block, if it looks like it is all
					// zeroes. This is common with WAL preallocation.
					//
					// Set r.err to be an error so r.recover actually recovers.
					r.err = ErrZeroedChunk
					r.recover()
					continue
				}
				return ErrZeroedChunk
			}

			headerSize := legacyHeaderSize
			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
				headerSize = recyclableHeaderSize
				if r.end+headerSize > r.n {
					return ErrInvalidChunk
				}

				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
				if logNum != r.logNum {
					if wantFirst {
						// If we're looking for the first chunk of a record, we can treat a
						// previous instance of the log as EOF.
						return io.EOF
					}
					// Otherwise, treat this chunk as invalid in order to prevent reading
					// of a partial record.
					return ErrInvalidChunk
				}

				// Map the recyclable chunk type onto its legacy equivalent so the
				// checks below only deal with the four legacy types.
				chunkType -= (recyclableFullChunkType - 1)
			}

			r.begin = r.end + headerSize
			r.end = r.begin + int(length)
			if r.end > r.n {
				// The chunk straddles a 32KB boundary (or the end of file).
				if r.recovering {
					// NOTE(review): recover returns immediately when r.err is nil,
					// and r.err is not set here (unlike the zeroed-chunk case
					// above), so this call looks like a no-op on the visible call
					// paths - confirm that is intended.
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			// The checksum covers everything after the length field: the type byte,
			// the log number (recyclable format only) and the payload.
			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
				if r.recovering {
					// NOTE(review): same observation as above - r.err appears to be
					// nil here, which would make recover a no-op; confirm.
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if wantFirst {
				if chunkType != fullChunkType && chunkType != firstChunkType {
					// Not the start of a record; keep scanning from the end of this
					// chunk.
					continue
				}
			}
			r.last = chunkType == fullChunkType || chunkType == lastChunkType
			r.recovering = false
			return nil
		}
		// The buffered block is exhausted. A short block that has already been
		// read is the final block, so there is nothing more to load.
		if r.n < blockSize && r.blockNum >= 0 {
			if !wantFirst || r.end != r.n {
				// This can happen if the previous instance of the log ended with a
				// partial block at the same blockNum as the new log but extended
				// beyond the partial block of the new log.
				return ErrInvalidChunk
			}
			return io.EOF
		}
		// Load the next block. A short read (io.ErrUnexpectedEOF) is accepted and
		// simply produces a block with n < blockSize.
		n, err := io.ReadFull(r.r, r.buf[:])
		if err != nil && err != io.ErrUnexpectedEOF {
			if err == io.EOF && !wantFirst {
				// Running out of input in the middle of a record.
				return io.ErrUnexpectedEOF
			}
			return err
		}
		r.begin, r.end, r.n = 0, 0, n
		r.blockNum++
	}
}
   294  
   295  // Next returns a reader for the next record. It returns io.EOF if there are no
   296  // more records. The reader returned becomes stale after the next Next call,
   297  // and should no longer be used.
   298  func (r *Reader) Next() (io.Reader, error) {
   299  	r.seq++
   300  	if r.err != nil {
   301  		return nil, r.err
   302  	}
   303  	r.begin = r.end
   304  	r.err = r.nextChunk(true)
   305  	if r.err != nil {
   306  		return nil, r.err
   307  	}
   308  	return singleReader{r, r.seq}, nil
   309  }
   310  
   311  // Offset returns the current offset within the file. If called immediately
   312  // before a call to Next(), Offset() will return the record offset.
   313  func (r *Reader) Offset() int64 {
   314  	if r.blockNum < 0 {
   315  		return 0
   316  	}
   317  	return int64(r.blockNum)*blockSize + int64(r.end)
   318  }
   319  
   320  // recover clears any errors read so far, so that calling Next will start
   321  // reading from the next good 32KiB block. If there are no such blocks, Next
   322  // will return io.EOF. recover also marks the current reader, the one most
   323  // recently returned by Next, as stale. If recover is called without any
   324  // prior error, then recover is a no-op.
   325  func (r *Reader) recover() {
   326  	if r.err == nil {
   327  		return
   328  	}
   329  	r.recovering = true
   330  	r.err = nil
   331  	// Discard the rest of the current block.
   332  	r.begin, r.end, r.last = r.n, r.n, false
   333  	// Invalidate any outstanding singleReader.
   334  	r.seq++
   335  }
   336  
   337  // seekRecord seeks in the underlying io.Reader such that calling r.Next
   338  // returns the record whose first chunk header starts at the provided offset.
   339  // Its behavior is undefined if the argument given is not such an offset, as
   340  // the bytes at that offset may coincidentally appear to be a valid header.
   341  //
   342  // It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement
   343  // io.Seeker.
   344  //
   345  // seekRecord will fail and return an error if the Reader previously
   346  // encountered an error, including io.EOF. Such errors can be cleared by
   347  // calling Recover. Calling seekRecord after Recover will make calling Next
   348  // return the record at the given offset, instead of the record at the next
   349  // good 32KiB block as Recover normally would. Calling seekRecord before
   350  // Recover has no effect on Recover's semantics other than changing the
   351  // starting point for determining the next good 32KiB block.
   352  //
   353  // The offset is always relative to the start of the underlying io.Reader, so
   354  // negative values will result in an error as per io.Seeker.
   355  func (r *Reader) seekRecord(offset int64) error {
   356  	r.seq++
   357  	if r.err != nil {
   358  		return r.err
   359  	}
   360  
   361  	s, ok := r.r.(io.Seeker)
   362  	if !ok {
   363  		return ErrNotAnIOSeeker
   364  	}
   365  
   366  	// Only seek to an exact block offset.
   367  	c := int(offset & blockSizeMask)
   368  	if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil {
   369  		return r.err
   370  	}
   371  
   372  	// Clear the state of the internal reader.
   373  	r.begin, r.end, r.n = 0, 0, 0
   374  	r.blockNum, r.recovering, r.last = -1, false, false
   375  	if r.err = r.nextChunk(false); r.err != nil {
   376  		return r.err
   377  	}
   378  
   379  	// Now skip to the offset requested within the block. A subsequent
   380  	// call to Next will return the block at the requested offset.
   381  	r.begin, r.end = c, c
   382  
   383  	return nil
   384  }
   385  
// singleReader is the io.Reader returned by Reader.Next for one record. r is
// the Reader it reads through and seq is the value of r.seq at the time it was
// created; Read refuses to proceed once r.seq no longer matches.
type singleReader struct {
	r   *Reader
	seq int
}
   390  
   391  func (x singleReader) Read(p []byte) (int, error) {
   392  	r := x.r
   393  	if r.seq != x.seq {
   394  		return 0, errors.New("pebble/record: stale reader")
   395  	}
   396  	if r.err != nil {
   397  		return 0, r.err
   398  	}
   399  	for r.begin == r.end {
   400  		if r.last {
   401  			return 0, io.EOF
   402  		}
   403  		if r.err = r.nextChunk(false); r.err != nil {
   404  			return 0, r.err
   405  		}
   406  	}
   407  	n := copy(p, r.buf[r.begin:r.end])
   408  	r.begin += n
   409  	return n, nil
   410  }
   411  
// Writer writes records to an underlying io.Writer. It buffers one block at a
// time and hands out a singleWriter per record via Next.
type Writer struct {
	// w is the underlying writer.
	w io.Writer
	// seq is the sequence number of the current record. It is incremented by
	// Next, Flush and Close; a singleWriter created under an older value
	// refuses to write.
	seq int
	// f is w as a flusher. It is nil if w does not implement flusher.
	f flusher
	// buf[i:j] is the bytes that will become the current chunk.
	// The low bound, i, includes the chunk header.
	i, j int
	// buf[:written] has already been written to w.
	// written is reset to zero whenever a full block is written out, and is
	// advanced to j each time writePending writes a partial block (as done by
	// Flush, Close and WriteRecord).
	written int
	// baseOffset is the base offset in w at which writing started. If
	// w implements io.Seeker, it's relative to the start of w, 0 otherwise.
	baseOffset int64
	// blockNumber is the zero based block number currently held in buf.
	blockNumber int64
	// lastRecordOffset is the offset in w where the last record was
	// written (including the chunk header). The value stored by Next already
	// includes baseOffset. It is -1 until Next has been called.
	lastRecordOffset int64
	// first is whether the current chunk is the first chunk of the record.
	first bool
	// pending is whether a chunk is buffered but not yet written.
	pending bool
	// err is any accumulated error. Close also sets it, so that a closed
	// Writer rejects further use.
	err error
	// buf is the buffer holding the current block.
	buf [blockSize]byte
}
   445  
   446  // NewWriter returns a new Writer.
   447  func NewWriter(w io.Writer) *Writer {
   448  	f, _ := w.(flusher)
   449  
   450  	var o int64
   451  	if s, ok := w.(io.Seeker); ok {
   452  		var err error
   453  		if o, err = s.Seek(0, io.SeekCurrent); err != nil {
   454  			o = 0
   455  		}
   456  	}
   457  	return &Writer{
   458  		w:                w,
   459  		f:                f,
   460  		baseOffset:       o,
   461  		lastRecordOffset: -1,
   462  	}
   463  }
   464  
   465  // fillHeader fills in the header for the pending chunk.
   466  func (w *Writer) fillHeader(last bool) {
   467  	if w.i+legacyHeaderSize > w.j || w.j > blockSize {
   468  		panic("pebble/record: bad writer state")
   469  	}
   470  	if last {
   471  		if w.first {
   472  			w.buf[w.i+6] = fullChunkType
   473  		} else {
   474  			w.buf[w.i+6] = lastChunkType
   475  		}
   476  	} else {
   477  		if w.first {
   478  			w.buf[w.i+6] = firstChunkType
   479  		} else {
   480  			w.buf[w.i+6] = middleChunkType
   481  		}
   482  	}
   483  	binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value())
   484  	binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize))
   485  }
   486  
   487  // writeBlock writes the buffered block to the underlying writer, and reserves
   488  // space for the next chunk's header.
   489  func (w *Writer) writeBlock() {
   490  	_, w.err = w.w.Write(w.buf[w.written:])
   491  	w.i = 0
   492  	w.j = legacyHeaderSize
   493  	w.written = 0
   494  	w.blockNumber++
   495  }
   496  
   497  // writePending finishes the current record and writes the buffer to the
   498  // underlying writer.
   499  func (w *Writer) writePending() {
   500  	if w.err != nil {
   501  		return
   502  	}
   503  	if w.pending {
   504  		w.fillHeader(true)
   505  		w.pending = false
   506  	}
   507  	_, w.err = w.w.Write(w.buf[w.written:w.j])
   508  	w.written = w.j
   509  }
   510  
   511  // Close finishes the current record and closes the writer.
   512  func (w *Writer) Close() error {
   513  	w.seq++
   514  	w.writePending()
   515  	if w.err != nil {
   516  		return w.err
   517  	}
   518  	w.err = errors.New("pebble/record: closed Writer")
   519  	return nil
   520  }
   521  
   522  // Flush finishes the current record, writes to the underlying writer, and
   523  // flushes it if that writer implements interface{ Flush() error }.
   524  func (w *Writer) Flush() error {
   525  	w.seq++
   526  	w.writePending()
   527  	if w.err != nil {
   528  		return w.err
   529  	}
   530  	if w.f != nil {
   531  		w.err = w.f.Flush()
   532  		return w.err
   533  	}
   534  	return nil
   535  }
   536  
   537  // Next returns a writer for the next record. The writer returned becomes stale
   538  // after the next Close, Flush or Next call, and should no longer be used.
   539  func (w *Writer) Next() (io.Writer, error) {
   540  	w.seq++
   541  	if w.err != nil {
   542  		return nil, w.err
   543  	}
   544  	if w.pending {
   545  		w.fillHeader(true)
   546  	}
   547  	w.i = w.j
   548  	w.j = w.j + legacyHeaderSize
   549  	// Check if there is room in the block for the header.
   550  	if w.j > blockSize {
   551  		// Fill in the rest of the block with zeroes.
   552  		clear(w.buf[w.i:])
   553  		w.writeBlock()
   554  		if w.err != nil {
   555  			return nil, w.err
   556  		}
   557  	}
   558  	w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i)
   559  	w.first = true
   560  	w.pending = true
   561  	return singleWriter{w, w.seq}, nil
   562  }
   563  
   564  // WriteRecord writes a complete record. Returns the offset just past the end
   565  // of the record.
   566  func (w *Writer) WriteRecord(p []byte) (int64, error) {
   567  	if w.err != nil {
   568  		return -1, w.err
   569  	}
   570  	t, err := w.Next()
   571  	if err != nil {
   572  		return -1, err
   573  	}
   574  	if _, err := t.Write(p); err != nil {
   575  		return -1, err
   576  	}
   577  	w.writePending()
   578  	offset := w.blockNumber*blockSize + int64(w.j)
   579  	return offset, w.err
   580  }
   581  
   582  // Size returns the current size of the file.
   583  func (w *Writer) Size() int64 {
   584  	if w == nil {
   585  		return 0
   586  	}
   587  	return w.blockNumber*blockSize + int64(w.j)
   588  }
   589  
   590  // LastRecordOffset returns the offset in the underlying io.Writer of the last
   591  // record so far - the one created by the most recent Next call. It is the
   592  // offset of the first chunk header, suitable to pass to Reader.SeekRecord.
   593  //
   594  // If that io.Writer also implements io.Seeker, the return value is an absolute
   595  // offset, in the sense of io.SeekStart, regardless of whether the io.Writer
   596  // was initially at the zero position when passed to NewWriter. Otherwise, the
   597  // return value is a relative offset, being the number of bytes written between
   598  // the NewWriter call and any records written prior to the last record.
   599  //
   600  // If there is no last record, i.e. nothing was written, LastRecordOffset will
   601  // return ErrNoLastRecord.
   602  func (w *Writer) LastRecordOffset() (int64, error) {
   603  	if w.err != nil {
   604  		return 0, w.err
   605  	}
   606  	if w.lastRecordOffset < 0 {
   607  		return 0, ErrNoLastRecord
   608  	}
   609  	return w.lastRecordOffset, nil
   610  }
   611  
// singleWriter is the io.Writer returned by Writer.Next for one record. w is
// the Writer it writes through and seq is the value of w.seq at the time it
// was created; Write refuses to proceed once w.seq no longer matches.
type singleWriter struct {
	w   *Writer
	seq int
}
   616  
   617  func (x singleWriter) Write(p []byte) (int, error) {
   618  	w := x.w
   619  	if w.seq != x.seq {
   620  		return 0, errors.New("pebble/record: stale writer")
   621  	}
   622  	if w.err != nil {
   623  		return 0, w.err
   624  	}
   625  	n0 := len(p)
   626  	for len(p) > 0 {
   627  		// Write a block, if it is full.
   628  		if w.j == blockSize {
   629  			w.fillHeader(false)
   630  			w.writeBlock()
   631  			if w.err != nil {
   632  				return 0, w.err
   633  			}
   634  			w.first = false
   635  		}
   636  		// Copy bytes into the buffer.
   637  		n := copy(w.buf[w.j:], p)
   638  		w.j += n
   639  		p = p[n:]
   640  	}
   641  	return n0, nil
   642  }