github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/record/record.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package record
    16  
    17  import (
    18  	"encoding/binary"
    19  	"io"
    20  
    21  	"github.com/zuoyebang/bitalosdb/internal/base"
    22  	"github.com/zuoyebang/bitalosdb/internal/crc"
    23  
    24  	"github.com/cockroachdb/errors"
    25  )
    26  
    27  const (
        	// Legacy chunk types, stored in byte 6 of a chunk header. A record is
        	// written either as a single full chunk, or as a first chunk followed
        	// by zero or more middle chunks and a terminating last chunk.
    28  	fullChunkType   = 1
    29  	firstChunkType  = 2
    30  	middleChunkType = 3
    31  	lastChunkType   = 4
    32  
        	// Recyclable chunk types mirror the legacy ones, offset by four. The
        	// Reader expects chunks of these types to carry a 4-byte log number
        	// after the legacy header, and maps them back to the legacy values by
        	// subtracting recyclableFullChunkType-1.
    33  	recyclableFullChunkType   = 5
    34  	recyclableFirstChunkType  = 6
    35  	recyclableMiddleChunkType = 7
    36  	recyclableLastChunkType   = 8
    37  )
    38  
    39  const (
        	// blockSize is the size of one block. Reader and Writer each buffer
        	// exactly one block, and a chunk never spans a block boundary.
    40  	blockSize            = 32 * 1024
        	// blockSizeMask extracts the offset within a block from a file offset;
        	// clearing these bits yields the offset of the start of the block.
    41  	blockSizeMask        = blockSize - 1
        	// legacyHeaderSize is the size of a legacy chunk header: a 4-byte
        	// little-endian checksum, a 2-byte little-endian payload length and a
        	// 1-byte chunk type.
    42  	legacyHeaderSize     = 7
        	// recyclableHeaderSize is the legacy header followed by a 4-byte
        	// little-endian log number.
    43  	recyclableHeaderSize = legacyHeaderSize + 4
    44  )
    45  
    46  var (
    47  	// ErrNotAnIOSeeker is returned by seekRecord if the io.Reader underlying
        	// a Reader does not implement io.Seeker.
    48  	ErrNotAnIOSeeker = errors.New("bitalosdb/record: reader does not implement io.Seeker")
    49  
    50  	// ErrNoLastRecord is returned if LastRecordOffset is called and there is
        	// no previous record, i.e. Writer.Next has never been called.
    51  	ErrNoLastRecord = errors.New("bitalosdb/record: no last record exists")
    52  
    53  	// ErrZeroedChunk is returned if a chunk is encountered that is zeroed,
        	// meaning its checksum, length and type are all zero. This usually
    54  	// occurs due to log file preallocation.
    55  	ErrZeroedChunk = base.CorruptionErrorf("bitalosdb/record: zeroed chunk")
    56  
    57  	// ErrInvalidChunk is returned if a chunk is encountered with an invalid
    58  	// header, length, or checksum. This usually occurs when a log is recycled,
    59  	// but can also occur due to corruption.
    60  	ErrInvalidChunk = base.CorruptionErrorf("bitalosdb/record: invalid chunk")
    61  )
    62  
    63  // IsInvalidRecord returns true if the error matches one of the error types
    64  // returned for invalid records. These are treated in a way similar to io.EOF
    65  // in recovery code.
    66  func IsInvalidRecord(err error) bool {
    67  	return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF
    68  }
    69  
    70  // Reader reads records from an underlying io.Reader. It buffers one block at
        // a time and hands out the payload of one chunk at a time through the
        // io.Reader returned by Next.
    71  type Reader struct {
    72  	// r is the underlying reader.
    73  	r io.Reader
    74  	// logNum is the low 32-bits of the log's file number. May be zero when used
    75  	// with log files that do not have a file number (e.g. the MANIFEST).
    76  	logNum uint32
    77  	// blockNum is the zero based block number currently held in buf. It is -1
        	// until the first block has been read.
    78  	blockNum int64
    79  	// seq is the sequence number of the current record. It is incremented by
        	// Next, seekRecord and an effective recover, which makes any previously
        	// returned singleReader stale.
    80  	seq int
    81  	// buf[begin:end] is the unread portion of the current chunk's payload. The
    82  	// low bound, begin, excludes the chunk header.
    83  	begin, end int
    84  	// n is the number of bytes of buf that are valid. Once reading has started,
    85  	// only the final block can have n < blockSize.
    86  	n int
    87  	// recovering is true when recovering from corruption.
    88  	recovering bool
    89  	// last is whether the current chunk is the last chunk of the record.
    90  	last bool
    91  	// err is any accumulated error. Once set, Next and seekRecord keep
        	// returning it.
    92  	err error
    93  	// buf is the buffer holding the current block.
    94  	buf [blockSize]byte
    95  }
    96  
    97  // NewReader returns a new reader. If the file contains records encoded using
    98  // the recyclable record format, then the log number in those records must
    99  // match the specified logNum.
   100  func NewReader(r io.Reader, logNum base.FileNum) *Reader {
   101  	return &Reader{
   102  		r:        r,
   103  		logNum:   uint32(logNum),
   104  		blockNum: -1,
   105  	}
   106  }
   107  
   108  // nextChunk sets r.buf[r.begin:r.end] to hold the next chunk's payload,
   109  // reading the next block into the buffer if necessary.
        //
        // If wantFirst is true, the caller is looking for the start of a record:
        // chunks that are neither full nor first chunks are skipped, and a
        // recyclable chunk carrying a different log number is reported as io.EOF.
        // If wantFirst is false, the caller is in the middle of a record: a log
        // number mismatch is reported as ErrInvalidChunk, and hitting the end of
        // the underlying reader is reported as io.ErrUnexpectedEOF.
        //
        // On success it also sets r.last and clears r.recovering.
   110  func (r *Reader) nextChunk(wantFirst bool) error {
   111  	for {
        		// A chunk header starts at r.end if at least a legacy header fits in
        		// the valid part of the block.
   112  		if r.end+legacyHeaderSize <= r.n {
        			// Header layout: bytes 0-3 checksum, bytes 4-5 payload length,
        			// byte 6 chunk type.
   113  			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
   114  			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
   115  			chunkType := r.buf[r.end+6]
   116  
   117  			if checksum == 0 && length == 0 && chunkType == 0 {
   118  				if r.end+recyclableHeaderSize > r.n {
   119  					// Skip the rest of the block if the recyclable header size does not
   120  					// fit within it.
   121  					r.end = r.n
   122  					continue
   123  				}
   124  				if r.recovering {
   125  					// Skip the rest of the block, if it looks like it is all
   126  					// zeroes. This is common with WAL preallocation.
   127  					//
   128  					// Set r.err to be an error so r.recover actually recovers.
   129  					r.err = ErrZeroedChunk
   130  					r.recover()
   131  					continue
   132  				}
   133  				return ErrZeroedChunk
   134  			}
   135  
   136  			headerSize := legacyHeaderSize
   137  			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
   138  				headerSize = recyclableHeaderSize
   139  				if r.end+headerSize > r.n {
   140  					return ErrInvalidChunk
   141  				}
   142  
        				// The log number occupies the four bytes after the legacy header.
   143  				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
   144  				if logNum != r.logNum {
   145  					if wantFirst {
   146  						// If we're looking for the first chunk of a record, we can treat a
   147  						// previous instance of the log as EOF.
   148  						return io.EOF
   149  					}
   150  					// Otherwise, treat this chunk as invalid in order to prevent reading
   151  					// of a partial record.
   152  					return ErrInvalidChunk
   153  				}
   154  
        				// Map the recyclable chunk type onto its legacy equivalent so the
        				// checks below only deal with the legacy values.
   155  				chunkType -= recyclableFullChunkType - 1
   156  			}
   157  
   158  			r.begin = r.end + headerSize
   159  			r.end = r.begin + int(length)
        			// The declared payload must fit inside the valid part of the block.
   160  			if r.end > r.n {
   161  				if r.recovering {
   162  					r.recover()
   163  					continue
   164  				}
   165  				return ErrInvalidChunk
   166  			}
        			// The checksum covers everything from the chunk type byte (header
        			// byte 6) to the end of the payload.
   167  			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
   168  				if r.recovering {
   169  					r.recover()
   170  					continue
   171  				}
   172  				return ErrInvalidChunk
   173  			}
   174  			if wantFirst {
        				// Skip chunks that do not start a record; r.end already points
        				// past this chunk, so the loop moves on to the next header.
   175  				if chunkType != fullChunkType && chunkType != firstChunkType {
   176  					continue
   177  				}
   178  			}
   179  			r.last = chunkType == fullChunkType || chunkType == lastChunkType
   180  			r.recovering = false
   181  			return nil
   182  		}
        		// No further header fits in this block. A short block that has
        		// already been read is the final block, so there is nothing more
        		// to read.
   183  		if r.n < blockSize && r.blockNum >= 0 {
   184  			if !wantFirst || r.end != r.n {
   185  				// This can happen if the previous instance of the log ended with a
   186  				// partial block at the same blockNum as the new log but extended
   187  				// beyond the partial block of the new log.
   188  				return ErrInvalidChunk
   189  			}
   190  			return io.EOF
   191  		}
        		// Load the next block. io.ErrUnexpectedEOF from ReadFull only means
        		// the block is short, which is accepted here and recorded in r.n.
   192  		n, err := io.ReadFull(r.r, r.buf[:])
   193  		if err != nil && err != io.ErrUnexpectedEOF {
   194  			if err == io.EOF && !wantFirst {
   195  				return io.ErrUnexpectedEOF
   196  			}
   197  			return err
   198  		}
   199  		r.begin, r.end, r.n = 0, 0, n
   200  		r.blockNum++
   201  	}
   202  }
   203  
   204  // Next returns a reader for the next record. It returns io.EOF if there are no
   205  // more records. The reader returned becomes stale after the next Next call,
   206  // and should no longer be used.
   207  func (r *Reader) Next() (io.Reader, error) {
   208  	r.seq++
   209  	if r.err != nil {
   210  		return nil, r.err
   211  	}
   212  	r.begin = r.end
   213  	r.err = r.nextChunk(true)
   214  	if r.err != nil {
   215  		return nil, r.err
   216  	}
   217  	return singleReader{r, r.seq}, nil
   218  }
   219  
   220  // Offset returns the current offset within the file. If called immediately
   221  // before a call to Next(), Offset() will return the record offset.
   222  func (r *Reader) Offset() int64 {
   223  	if r.blockNum < 0 {
   224  		return 0
   225  	}
   226  	return int64(r.blockNum)*blockSize + int64(r.end)
   227  }
   228  
   229  // recover clears any errors read so far, so that calling Next will start
   230  // reading from the next good 32KiB block. If there are no such blocks, Next
   231  // will return io.EOF. recover also marks the current reader, the one most
   232  // recently returned by Next, as stale. If recover is called without any
   233  // prior error, then recover is a no-op.
   234  func (r *Reader) recover() {
   235  	if r.err == nil {
   236  		return
   237  	}
   238  	r.recovering = true
   239  	r.err = nil
   240  	// Discard the rest of the current block.
   241  	r.begin, r.end, r.last = r.n, r.n, false
   242  	// Invalidate any outstanding singleReader.
   243  	r.seq++
   244  }
   245  
   246  // seekRecord seeks in the underlying io.Reader such that calling r.Next
   247  // returns the record whose first chunk header starts at the provided offset.
   248  // Its behavior is undefined if the argument given is not such an offset, as
   249  // the bytes at that offset may coincidentally appear to be a valid header.
   250  //
   251  // It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement
   252  // io.Seeker.
   253  //
   254  // seekRecord will fail and return an error if the Reader previously
   255  // encountered an error, including io.EOF. Such errors can be cleared by
   256  // calling Recover. Calling seekRecord after Recover will make calling Next
   257  // return the record at the given offset, instead of the record at the next
   258  // good 32KiB block as Recover normally would. Calling seekRecord before
   259  // Recover has no effect on Recover's semantics other than changing the
   260  // starting point for determining the next good 32KiB block.
   261  //
   262  // The offset is always relative to the start of the underlying io.Reader, so
   263  // negative values will result in an error as per io.Seeker.
   264  func (r *Reader) seekRecord(offset int64) error {
   265  	r.seq++
   266  	if r.err != nil {
   267  		return r.err
   268  	}
   269  
   270  	s, ok := r.r.(io.Seeker)
   271  	if !ok {
   272  		return ErrNotAnIOSeeker
   273  	}
   274  
   275  	// Only seek to an exact block offset.
   276  	c := int(offset & blockSizeMask)
   277  	if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil {
   278  		return r.err
   279  	}
   280  
   281  	// Clear the state of the internal reader.
   282  	r.begin, r.end, r.n = 0, 0, 0
   283  	r.blockNum, r.recovering, r.last = -1, false, false
   284  	if r.err = r.nextChunk(false); r.err != nil {
   285  		return r.err
   286  	}
   287  
   288  	// Now skip to the offset requested within the block. A subsequent
   289  	// call to Next will return the block at the requested offset.
   290  	r.begin, r.end = c, c
   291  
   292  	return nil
   293  }
   294  
   295  type singleReader struct {
   296  	r   *Reader
   297  	seq int
   298  }
   299  
   300  func (x singleReader) Read(p []byte) (int, error) {
   301  	r := x.r
   302  	if r.seq != x.seq {
   303  		return 0, errors.New("bitalosdb/record: stale reader")
   304  	}
   305  	if r.err != nil {
   306  		return 0, r.err
   307  	}
   308  	for r.begin == r.end {
   309  		if r.last {
   310  			return 0, io.EOF
   311  		}
   312  		if r.err = r.nextChunk(false); r.err != nil {
   313  			return 0, r.err
   314  		}
   315  	}
   316  	n := copy(p, r.buf[r.begin:r.end])
   317  	r.begin += n
   318  	return n, nil
   319  }
   320  
   321  // Writer writes records to an underlying io.Writer. It buffers one block at
        // a time and splits each record into chunks that fit within a block.
   322  type Writer struct {
   323  	// w is the underlying writer.
   324  	w io.Writer
   325  	// seq is the sequence number of the current record. It is incremented by
        	// Next, Flush and Close, which makes any previously returned
        	// singleWriter stale.
   326  	seq int
   327  	// f is w as a flusher. It is nil if w does not implement flusher.
   328  	f flusher
   329  	// buf[i:j] is the bytes that will become the current chunk.
   330  	// The low bound, i, includes the chunk header.
   331  	i, j int
   332  	// buf[:written] has already been written to w.
   333  	// written is zero unless writePending has run for the current block
        	// (via Flush, Close or WriteRecord).
   334  	written int
   335  	// baseOffset is the base offset in w at which writing started. If
   336  	// w implements io.Seeker, it's relative to the start of w, 0 otherwise.
   337  	baseOffset int64
   338  	// blockNumber is the zero based block number currently held in buf.
   339  	blockNumber int64
   340  	// lastRecordOffset is the offset in w where the last record was
   341  	// written (including the chunk header). Next stores it with baseOffset
   342  	// already added, so no further adjustment is needed. It is -1 until
   343  	// the first record has been started.
   344  	lastRecordOffset int64
   345  	// first is whether the current chunk is the first chunk of the record.
   346  	first bool
   347  	// pending is whether a chunk is buffered but not yet written.
   348  	pending bool
   349  	// err is any accumulated error. Close sets it to a "closed Writer" error
        	// so that later calls fail.
   350  	err error
   351  	// buf is the buffer holding the current block.
   352  	buf [blockSize]byte
   353  }
   354  
   355  // NewWriter returns a new Writer.
   356  func NewWriter(w io.Writer) *Writer {
   357  	f, _ := w.(flusher)
   358  
   359  	var o int64
   360  	if s, ok := w.(io.Seeker); ok {
   361  		var err error
   362  		if o, err = s.Seek(0, io.SeekCurrent); err != nil {
   363  			o = 0
   364  		}
   365  	}
   366  	return &Writer{
   367  		w:                w,
   368  		f:                f,
   369  		baseOffset:       o,
   370  		lastRecordOffset: -1,
   371  	}
   372  }
   373  
   374  // fillHeader fills in the header for the pending chunk.
   375  func (w *Writer) fillHeader(last bool) {
   376  	if w.i+legacyHeaderSize > w.j || w.j > blockSize {
   377  		panic("bitalosdb/record: bad writer state")
   378  	}
   379  	if last {
   380  		if w.first {
   381  			w.buf[w.i+6] = fullChunkType
   382  		} else {
   383  			w.buf[w.i+6] = lastChunkType
   384  		}
   385  	} else {
   386  		if w.first {
   387  			w.buf[w.i+6] = firstChunkType
   388  		} else {
   389  			w.buf[w.i+6] = middleChunkType
   390  		}
   391  	}
   392  	binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value())
   393  	binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize))
   394  }
   395  
   396  // writeBlock writes the buffered block to the underlying writer, and reserves
   397  // space for the next chunk's header.
   398  func (w *Writer) writeBlock() {
   399  	_, w.err = w.w.Write(w.buf[w.written:])
   400  	w.i = 0
   401  	w.j = legacyHeaderSize
   402  	w.written = 0
   403  	w.blockNumber++
   404  }
   405  
   406  // writePending finishes the current record and writes the buffer to the
   407  // underlying writer.
   408  func (w *Writer) writePending() {
   409  	if w.err != nil {
   410  		return
   411  	}
   412  	if w.pending {
   413  		w.fillHeader(true)
   414  		w.pending = false
   415  	}
   416  	_, w.err = w.w.Write(w.buf[w.written:w.j])
   417  	w.written = w.j
   418  }
   419  
   420  // Close finishes the current record and closes the writer.
   421  func (w *Writer) Close() error {
   422  	w.seq++
   423  	w.writePending()
   424  	if w.err != nil {
   425  		return w.err
   426  	}
   427  	w.err = errors.New("bitalosdb/record: closed Writer")
   428  	return nil
   429  }
   430  
   431  // Flush finishes the current record, writes to the underlying writer, and
   432  // flushes it if that writer implements interface{ Flush() error }.
   433  func (w *Writer) Flush() error {
   434  	w.seq++
   435  	w.writePending()
   436  	if w.err != nil {
   437  		return w.err
   438  	}
   439  	if w.f != nil {
   440  		w.err = w.f.Flush()
   441  		return w.err
   442  	}
   443  	return nil
   444  }
   445  
   446  // Next returns a writer for the next record. The writer returned becomes stale
   447  // after the next Close, Flush or Next call, and should no longer be used.
   448  func (w *Writer) Next() (io.Writer, error) {
   449  	w.seq++
   450  	if w.err != nil {
   451  		return nil, w.err
   452  	}
   453  	if w.pending {
   454  		w.fillHeader(true)
   455  	}
   456  	w.i = w.j
   457  	w.j = w.j + legacyHeaderSize
   458  	// Check if there is room in the block for the header.
   459  	if w.j > blockSize {
   460  		// Fill in the rest of the block with zeroes.
   461  		for k := w.i; k < blockSize; k++ {
   462  			w.buf[k] = 0
   463  		}
   464  		w.writeBlock()
   465  		if w.err != nil {
   466  			return nil, w.err
   467  		}
   468  	}
   469  	w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i)
   470  	w.first = true
   471  	w.pending = true
   472  	return singleWriter{w, w.seq}, nil
   473  }
   474  
   475  // WriteRecord writes a complete record. Returns the offset just past the end
   476  // of the record.
   477  func (w *Writer) WriteRecord(p []byte) (int64, error) {
   478  	if w.err != nil {
   479  		return -1, w.err
   480  	}
   481  	t, err := w.Next()
   482  	if err != nil {
   483  		return -1, err
   484  	}
   485  	if _, err := t.Write(p); err != nil {
   486  		return -1, err
   487  	}
   488  	w.writePending()
   489  	offset := w.blockNumber*blockSize + int64(w.j)
   490  	return offset, w.err
   491  }
   492  
   493  // Size returns the current size of the file.
   494  func (w *Writer) Size() int64 {
   495  	if w == nil {
   496  		return 0
   497  	}
   498  	return w.blockNumber*blockSize + int64(w.j)
   499  }
   500  
   501  // LastRecordOffset returns the offset in the underlying io.Writer of the last
   502  // record so far - the one created by the most recent Next call. It is the
   503  // offset of the first chunk header, suitable to pass to Reader.SeekRecord.
   504  //
   505  // If that io.Writer also implements io.Seeker, the return value is an absolute
   506  // offset, in the sense of io.SeekStart, regardless of whether the io.Writer
   507  // was initially at the zero position when passed to NewWriter. Otherwise, the
   508  // return value is a relative offset, being the number of bytes written between
   509  // the NewWriter call and any records written prior to the last record.
   510  //
   511  // If there is no last record, i.e. nothing was written, LastRecordOffset will
   512  // return ErrNoLastRecord.
   513  func (w *Writer) LastRecordOffset() (int64, error) {
   514  	if w.err != nil {
   515  		return 0, w.err
   516  	}
   517  	if w.lastRecordOffset < 0 {
   518  		return 0, ErrNoLastRecord
   519  	}
   520  	return w.lastRecordOffset, nil
   521  }
   522  
   523  type singleWriter struct {
   524  	w   *Writer
   525  	seq int
   526  }
   527  
   528  func (x singleWriter) Write(p []byte) (int, error) {
   529  	w := x.w
   530  	if w.seq != x.seq {
   531  		return 0, errors.New("bitalosdb/record: stale writer")
   532  	}
   533  	if w.err != nil {
   534  		return 0, w.err
   535  	}
   536  	n0 := len(p)
   537  	for len(p) > 0 {
   538  		// Write a block, if it is full.
   539  		if w.j == blockSize {
   540  			w.fillHeader(false)
   541  			w.writeBlock()
   542  			if w.err != nil {
   543  				return 0, w.err
   544  			}
   545  			w.first = false
   546  		}
   547  		// Copy bytes into the buffer.
   548  		n := copy(w.buf[w.j:], p)
   549  		w.j += n
   550  		p = p[n:]
   551  	}
   552  	return n0, nil
   553  }