github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/record/record.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

// Package record reads and writes sequences of records. Each record is a stream
// of bytes that completes before the next record starts.
//
// When reading, call Next to obtain an io.Reader for the next record. Next will
// return io.EOF when there are no more records. It is valid to call Next
// without reading the current record to exhaustion.
//
// When writing, call Next to obtain an io.Writer for the next record. Calling
// Next finishes the current record. Call Close to finish the final record.
//
// Optionally, call Flush to finish the current record and flush the underlying
// writer without starting a new record. To start a new record after flushing,
// call Next.
//
// Neither Readers nor Writers are safe to use concurrently.
//
// Example code:
//
//	func read(r io.Reader) ([]string, error) {
//		var ss []string
//		records := record.NewReader(r)
//		for {
//			rec, err := records.Next()
//			if err == io.EOF {
//				break
//			}
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				records.Recover()
//				continue
//			}
//			s, err := io.ReadAll(rec)
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				records.Recover()
//				continue
//			}
//			ss = append(ss, string(s))
//		}
//		return ss, nil
//	}
//
//	func write(w io.Writer, ss []string) error {
//		records := record.NewWriter(w)
//		for _, s := range ss {
//			rec, err := records.Next()
//			if err != nil {
//				return err
//			}
//			if _, err := rec.Write([]byte(s)); err != nil {
//				return err
//			}
//		}
//		return records.Close()
//	}
//
// The wire format is that the stream is divided into 32 KiB blocks, and each
// block contains a number of tightly packed chunks. Chunks cannot cross block
// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a
// block must be zero.
//
// A record maps to one or more chunks. There are two chunk formats: legacy and
// recyclable. The legacy chunk format:
//
//	+----------+-----------+-----------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Payload   |
//	+----------+-----------+-----------+--- ... ---+
//
// CRC is computed over the type and payload
// Size is the length of the payload in bytes
// Type is the chunk type
//
// There are four chunk types: whether the chunk is the full record, or the
// first, middle or last chunk of a multi-chunk record. A multi-chunk record
// has one first chunk, zero or more middle chunks, and one last chunk.
//
// The recyclable chunk format is similar to the legacy format, but extends
// the chunk header with an additional log number field. This allows reuse
// (recycling) of log files which can provide significantly better performance
// when syncing frequently as it avoids needing to update the file
// metadata. Additionally, recycling log files is a prerequisite for using
// direct IO with log writing. The recyclable format is:
//
//	+----------+-----------+-----------+----------------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
//	+----------+-----------+-----------+----------------+--- ... ---+
//
// Recyclable chunks are distinguished from legacy chunks by the addition of 4
// extra "recyclable" chunk types that map directly to the legacy chunk types
// (i.e. full, first, middle, last). The CRC is computed over the type, log
// number, and payload.
//
// The wire format allows for limited recovery in the face of data corruption:
// on a format error (such as a checksum mismatch), the reader moves to the
// next block and looks for the next full or first chunk.
package record

// The C++ LevelDB code calls this the log, but it has been renamed to record
// to avoid clashing with the standard log package, and because it is generally
// useful outside of logging. The C++ code also uses the term "physical record"
// instead of "chunk", but "chunk" is shorter and less confusing.

import (
	"encoding/binary"
	"io"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/crc"
)
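// The sketch below is illustrative only (not part of the original file): it
// assembles a legacy full chunk for a payload that fits in one chunk,
// mirroring the header layout in the package comment: CRC (4B), Size (2B),
// Type (1B), then the payload. The function name is hypothetical.
func exampleLegacyFullChunk(payload []byte) []byte {
	buf := make([]byte, legacyHeaderSize+len(payload))
	buf[6] = fullChunkType
	copy(buf[legacyHeaderSize:], payload)
	// The CRC covers the type byte and the payload; the CRC and size fields
	// themselves are excluded, exactly as in Writer.fillHeader below.
	binary.LittleEndian.PutUint32(buf[0:4], crc.New(buf[6:]).Value())
	binary.LittleEndian.PutUint16(buf[4:6], uint16(len(payload)))
	return buf
}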
// These constants are part of the wire format and should not be changed.
const (
	fullChunkType   = 1
	firstChunkType  = 2
	middleChunkType = 3
	lastChunkType   = 4

	recyclableFullChunkType   = 5
	recyclableFirstChunkType  = 6
	recyclableMiddleChunkType = 7
	recyclableLastChunkType   = 8
)

const (
	blockSize            = 32 * 1024
	blockSizeMask        = blockSize - 1
	legacyHeaderSize     = 7
	recyclableHeaderSize = legacyHeaderSize + 4
)

var (
	// ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker.
	ErrNotAnIOSeeker = errors.New("bitalostable/record: reader does not implement io.Seeker")

	// ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record.
	ErrNoLastRecord = errors.New("bitalostable/record: no last record exists")

	// ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This
	// usually occurs due to log file preallocation.
	ErrZeroedChunk = base.CorruptionErrorf("bitalostable/record: zeroed chunk")

	// ErrInvalidChunk is returned if a chunk is encountered with an invalid
	// header, length, or checksum. This usually occurs when a log is recycled,
	// but can also occur due to corruption.
	ErrInvalidChunk = base.CorruptionErrorf("bitalostable/record: invalid chunk")
)

// IsInvalidRecord returns true if the error matches one of the error types
// returned for invalid records. These are treated in a way similar to io.EOF
// in recovery code.
func IsInvalidRecord(err error) bool {
	return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF
}
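// IsInvalidRecord supports LevelDB-style salvage loops. The helper below is a
// hypothetical in-package sketch (recover is unexported here) showing the
// intended pattern: keep every intact record, skip past corruption, and only
// surface hard I/O errors.
func readAllValid(r *Reader) ([][]byte, error) {
	var recs [][]byte
	for {
		rec, err := r.Next()
		if err == io.EOF {
			return recs, nil
		}
		if err != nil {
			if !IsInvalidRecord(err) {
				return recs, err
			}
			r.recover() // resume at the next good 32 KiB block
			continue
		}
		b, err := io.ReadAll(rec)
		if err != nil {
			if !IsInvalidRecord(err) {
				return recs, err
			}
			r.recover()
			continue
		}
		recs = append(recs, b)
	}
}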
// Reader reads records from an underlying io.Reader.
type Reader struct {
	// r is the underlying reader.
	r io.Reader
	// logNum is the low 32-bits of the log's file number. May be zero when used
	// with log files that do not have a file number (e.g. the MANIFEST).
	logNum uint32
	// blockNum is the zero based block number currently held in buf.
	blockNum int64
	// seq is the sequence number of the current record.
	seq int
	// buf[begin:end] is the unread portion of the current chunk's payload. The
	// low bound, begin, excludes the chunk header.
	begin, end int
	// n is the number of bytes of buf that are valid. Once reading has started,
	// only the final block can have n < blockSize.
	n int
	// recovering is true when recovering from corruption.
	recovering bool
	// last is whether the current chunk is the last chunk of the record.
	last bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}

// NewReader returns a new reader. If the file contains records encoded using
// the recyclable record format, then the log number in those records must
// match the specified logNum.
func NewReader(r io.Reader, logNum base.FileNum) *Reader {
	return &Reader{
		r:        r,
		logNum:   uint32(logNum),
		blockNum: -1,
	}
}
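// Hypothetical sketch (not in the original file): how a chunk header is
// decoded at offset off within a block, matching what nextChunk does below.
// Recyclable chunk types 5-8 fold onto legacy types 1-4; the extra 4-byte log
// number sits between the type byte and the payload.
func decodeChunkHeader(block []byte, off int) (checksum uint32, length uint16, chunkType byte, headerSize int) {
	checksum = binary.LittleEndian.Uint32(block[off : off+4])
	length = binary.LittleEndian.Uint16(block[off+4 : off+6])
	chunkType = block[off+6]
	headerSize = legacyHeaderSize
	if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
		headerSize = recyclableHeaderSize
		chunkType -= recyclableFullChunkType - 1
	}
	return checksum, length, chunkType, headerSize
}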
// nextChunk sets r.buf[r.begin:r.end] to hold the next chunk's payload,
// reading the next block into the buffer if necessary.
func (r *Reader) nextChunk(wantFirst bool) error {
	for {
		if r.end+legacyHeaderSize <= r.n {
			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
			chunkType := r.buf[r.end+6]

			if checksum == 0 && length == 0 && chunkType == 0 {
				if r.end+recyclableHeaderSize > r.n {
					// Skip the rest of the block if the recyclable header size does not
					// fit within it.
					r.end = r.n
					continue
				}
				if r.recovering {
					// Skip the rest of the block, if it looks like it is all
					// zeroes. This is common with WAL preallocation.
					//
					// Set r.err to be an error so r.recover actually recovers.
					r.err = ErrZeroedChunk
					r.recover()
					continue
				}
				return ErrZeroedChunk
			}

			headerSize := legacyHeaderSize
			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
				headerSize = recyclableHeaderSize
				if r.end+headerSize > r.n {
					return ErrInvalidChunk
				}

				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
				if logNum != r.logNum {
					if wantFirst {
						// If we're looking for the first chunk of a record, we can treat a
						// previous instance of the log as EOF.
						return io.EOF
					}
					// Otherwise, treat this chunk as invalid in order to prevent reading
					// of a partial record.
					return ErrInvalidChunk
				}

				chunkType -= (recyclableFullChunkType - 1)
			}

			r.begin = r.end + headerSize
			r.end = r.begin + int(length)
			if r.end > r.n {
				if r.recovering {
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
				if r.recovering {
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if wantFirst {
				if chunkType != fullChunkType && chunkType != firstChunkType {
					continue
				}
			}
			r.last = chunkType == fullChunkType || chunkType == lastChunkType
			r.recovering = false
			return nil
		}
		if r.n < blockSize && r.blockNum >= 0 {
			if !wantFirst || r.end != r.n {
				// This can happen if the previous instance of the log ended with a
				// partial block at the same blockNum as the new log but extended
				// beyond the partial block of the new log.
				return ErrInvalidChunk
			}
			return io.EOF
		}
		n, err := io.ReadFull(r.r, r.buf[:])
		if err != nil && err != io.ErrUnexpectedEOF {
			if err == io.EOF && !wantFirst {
				return io.ErrUnexpectedEOF
			}
			return err
		}
		r.begin, r.end, r.n = 0, 0, n
		r.blockNum++
	}
}

// Next returns a reader for the next record. It returns io.EOF if there are no
// more records. The reader returned becomes stale after the next Next call,
// and should no longer be used.
func (r *Reader) Next() (io.Reader, error) {
	r.seq++
	if r.err != nil {
		return nil, r.err
	}
	r.begin = r.end
	r.err = r.nextChunk(true)
	if r.err != nil {
		return nil, r.err
	}
	return singleReader{r, r.seq}, nil
}

// Offset returns the current offset within the file. If called immediately
// before a call to Next(), Offset() will return the record offset.
func (r *Reader) Offset() int64 {
	if r.blockNum < 0 {
		return 0
	}
	return int64(r.blockNum)*blockSize + int64(r.end)
}

// recover clears any errors read so far, so that calling Next will start
// reading from the next good 32 KiB block. If there are no such blocks, Next
// will return io.EOF. recover also marks the current reader, the one most
// recently returned by Next, as stale. If recover is called without any
// prior error, then recover is a no-op.
func (r *Reader) recover() {
	if r.err == nil {
		return
	}
	r.recovering = true
	r.err = nil
	// Discard the rest of the current block.
	r.begin, r.end, r.last = r.n, r.n, false
	// Invalidate any outstanding singleReader.
	r.seq++
}

// seekRecord seeks in the underlying io.Reader such that calling r.Next
// returns the record whose first chunk header starts at the provided offset.
// Its behavior is undefined if the argument given is not such an offset, as
// the bytes at that offset may coincidentally appear to be a valid header.
//
// It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement
// io.Seeker.
//
// seekRecord will fail and return an error if the Reader previously
// encountered an error, including io.EOF. Such errors can be cleared by
// calling recover. Calling seekRecord after recover will make calling Next
// return the record at the given offset, instead of the record at the next
// good 32 KiB block as recover normally would. Calling seekRecord before
// recover has no effect on recover's semantics other than changing the
// starting point for determining the next good 32 KiB block.
//
// The offset is always relative to the start of the underlying io.Reader, so
// negative values will result in an error as per io.Seeker.
func (r *Reader) seekRecord(offset int64) error {
	r.seq++
	if r.err != nil {
		return r.err
	}

	s, ok := r.r.(io.Seeker)
	if !ok {
		return ErrNotAnIOSeeker
	}

	// Only seek to an exact block offset.
	c := int(offset & blockSizeMask)
	if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil {
		return r.err
	}

	// Clear the state of the internal reader.
	r.begin, r.end, r.n = 0, 0, 0
	r.blockNum, r.recovering, r.last = -1, false, false
	if r.err = r.nextChunk(false); r.err != nil {
		return r.err
	}

	// Now skip to the offset requested within the block. A subsequent
	// call to Next will return the record at the requested offset.
	r.begin, r.end = c, c

	return nil
}
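// A sketch of how Offset and seekRecord pair up (hypothetical, in-package
// since seekRecord is unexported): per the Offset contract, the value taken
// immediately before Next is the offset of that record's first chunk header,
// which is exactly what seekRecord expects.
func rereadLast(r *Reader) ([]byte, error) {
	off := r.Offset() // offset of the upcoming record's first chunk header
	if _, err := r.Next(); err != nil {
		return nil, err
	}
	if err := r.seekRecord(off); err != nil {
		return nil, err
	}
	rec, err := r.Next()
	if err != nil {
		return nil, err
	}
	return io.ReadAll(rec)
}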
type singleReader struct {
	r   *Reader
	seq int
}

func (x singleReader) Read(p []byte) (int, error) {
	r := x.r
	if r.seq != x.seq {
		return 0, errors.New("bitalostable/record: stale reader")
	}
	if r.err != nil {
		return 0, r.err
	}
	for r.begin == r.end {
		if r.last {
			return 0, io.EOF
		}
		if r.err = r.nextChunk(false); r.err != nil {
			return 0, r.err
		}
	}
	n := copy(p, r.buf[r.begin:r.end])
	r.begin += n
	return n, nil
}

// Writer writes records to an underlying io.Writer.
type Writer struct {
	// w is the underlying writer.
	w io.Writer
	// seq is the sequence number of the current record.
	seq int
	// f is w as a flusher.
	f flusher
	// buf[i:j] is the bytes that will become the current chunk.
	// The low bound, i, includes the chunk header.
	i, j int
	// buf[:written] has already been written to w.
	// written is zero unless Flush has been called.
	written int
	// baseOffset is the base offset in w at which writing started. If
	// w implements io.Seeker, it's relative to the start of w, 0 otherwise.
	baseOffset int64
	// blockNumber is the zero based block number currently held in buf.
	blockNumber int64
	// lastRecordOffset is the offset in w where the last record was
	// written (including the chunk header). It is a relative offset to
	// baseOffset, thus the absolute offset of the last record is
	// baseOffset + lastRecordOffset.
	lastRecordOffset int64
	// first is whether the current chunk is the first chunk of the record.
	first bool
	// pending is whether a chunk is buffered but not yet written.
	pending bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}

// NewWriter returns a new Writer.
func NewWriter(w io.Writer) *Writer {
	f, _ := w.(flusher)

	var o int64
	if s, ok := w.(io.Seeker); ok {
		var err error
		if o, err = s.Seek(0, io.SeekCurrent); err != nil {
			o = 0
		}
	}
	return &Writer{
		w:                w,
		f:                f,
		baseOffset:       o,
		lastRecordOffset: -1,
	}
}

// fillHeader fills in the header for the pending chunk.
func (w *Writer) fillHeader(last bool) {
	if w.i+legacyHeaderSize > w.j || w.j > blockSize {
		panic("bitalostable/record: bad writer state")
	}
	if last {
		if w.first {
			w.buf[w.i+6] = fullChunkType
		} else {
			w.buf[w.i+6] = lastChunkType
		}
	} else {
		if w.first {
			w.buf[w.i+6] = firstChunkType
		} else {
			w.buf[w.i+6] = middleChunkType
		}
	}
	binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value())
	binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize))
}

// writeBlock writes the buffered block to the underlying writer, and reserves
// space for the next chunk's header.
func (w *Writer) writeBlock() {
	_, w.err = w.w.Write(w.buf[w.written:])
	w.i = 0
	w.j = legacyHeaderSize
	w.written = 0
	w.blockNumber++
}
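// The chunk type written by fillHeader depends only on whether the chunk is
// the first and/or last chunk of its record. A hypothetical truth-table
// helper, for illustration:
func chunkTypeFor(first, last bool) byte {
	switch {
	case first && last:
		return fullChunkType // record fits in a single chunk
	case first:
		return firstChunkType
	case last:
		return lastChunkType
	default:
		return middleChunkType
	}
}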
// writePending finishes the current record and writes the buffer to the
// underlying writer.
func (w *Writer) writePending() {
	if w.err != nil {
		return
	}
	if w.pending {
		w.fillHeader(true)
		w.pending = false
	}
	_, w.err = w.w.Write(w.buf[w.written:w.j])
	w.written = w.j
}

// Close finishes the current record and closes the writer.
func (w *Writer) Close() error {
	w.seq++
	w.writePending()
	if w.err != nil {
		return w.err
	}
	w.err = errors.New("bitalostable/record: closed Writer")
	return nil
}

// Flush finishes the current record, writes to the underlying writer, and
// flushes it if that writer implements interface{ Flush() error }.
func (w *Writer) Flush() error {
	w.seq++
	w.writePending()
	if w.err != nil {
		return w.err
	}
	if w.f != nil {
		w.err = w.f.Flush()
		return w.err
	}
	return nil
}

// Next returns a writer for the next record. The writer returned becomes stale
// after the next Close, Flush or Next call, and should no longer be used.
func (w *Writer) Next() (io.Writer, error) {
	w.seq++
	if w.err != nil {
		return nil, w.err
	}
	if w.pending {
		w.fillHeader(true)
	}
	w.i = w.j
	w.j = w.j + legacyHeaderSize
	// Check if there is room in the block for the header.
	if w.j > blockSize {
		// Fill in the rest of the block with zeroes.
		for k := w.i; k < blockSize; k++ {
			w.buf[k] = 0
		}
		w.writeBlock()
		if w.err != nil {
			return nil, w.err
		}
	}
	w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i)
	w.first = true
	w.pending = true
	return singleWriter{w, w.seq}, nil
}

// WriteRecord writes a complete record. Returns the offset just past the end
// of the record.
func (w *Writer) WriteRecord(p []byte) (int64, error) {
	if w.err != nil {
		return -1, w.err
	}
	t, err := w.Next()
	if err != nil {
		return -1, err
	}
	if _, err := t.Write(p); err != nil {
		return -1, err
	}
	w.writePending()
	offset := w.blockNumber*blockSize + int64(w.j)
	return offset, w.err
}

// Size returns the current size of the file.
func (w *Writer) Size() int64 {
	if w == nil {
		return 0
	}
	return w.blockNumber*blockSize + int64(w.j)
}
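// Hypothetical batching sketch (not part of the original API): append several
// records via WriteRecord, then Flush once so the underlying writer is pushed
// a single time. Each returned offset is just past the end of its record.
func appendAll(w *Writer, recs [][]byte) ([]int64, error) {
	offs := make([]int64, 0, len(recs))
	for _, rec := range recs {
		off, err := w.WriteRecord(rec)
		if err != nil {
			return nil, err
		}
		offs = append(offs, off)
	}
	return offs, w.Flush()
}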
// LastRecordOffset returns the offset in the underlying io.Writer of the last
// record so far - the one created by the most recent Next call. It is the
// offset of the first chunk header, suitable to pass to Reader.seekRecord.
//
// If that io.Writer also implements io.Seeker, the return value is an absolute
// offset, in the sense of io.SeekStart, regardless of whether the io.Writer
// was initially at the zero position when passed to NewWriter. Otherwise, the
// return value is a relative offset: the number of bytes written between the
// NewWriter call and the start of the last record.
//
// If there is no last record, i.e. nothing was written, LastRecordOffset will
// return ErrNoLastRecord.
func (w *Writer) LastRecordOffset() (int64, error) {
	if w.err != nil {
		return 0, w.err
	}
	if w.lastRecordOffset < 0 {
		return 0, ErrNoLastRecord
	}
	return w.lastRecordOffset, nil
}

type singleWriter struct {
	w   *Writer
	seq int
}

func (x singleWriter) Write(p []byte) (int, error) {
	w := x.w
	if w.seq != x.seq {
		return 0, errors.New("bitalostable/record: stale writer")
	}
	if w.err != nil {
		return 0, w.err
	}
	n0 := len(p)
	for len(p) > 0 {
		// Write a block, if it is full.
		if w.j == blockSize {
			w.fillHeader(false)
			w.writeBlock()
			if w.err != nil {
				return 0, w.err
			}
			w.first = false
		}
		// Copy bytes into the buffer.
		n := copy(w.buf[w.j:], p)
		w.j += n
		p = p[n:]
	}
	return n0, nil
}
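// End-to-end sketch (hypothetical; assumes "bytes" is added to the import
// block): write two records through Writer and read them back with Reader,
// demonstrating that the chunk framing round-trips payloads byte-for-byte.
func roundTripExample() error {
	var buf bytes.Buffer
	w := NewWriter(&buf)
	for _, s := range []string{"hello", "world"} {
		rec, err := w.Next()
		if err != nil {
			return err
		}
		if _, err := rec.Write([]byte(s)); err != nil {
			return err
		}
	}
	if err := w.Close(); err != nil {
		return err
	}
	r := NewReader(&buf, 0 /* logNum; only recyclable chunks check it */)
	for {
		rec, err := r.Next()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		if _, err := io.ReadAll(rec); err != nil {
			return err
		}
	}
}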