github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/record/record.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package record reads and writes sequences of records. Each record is a stream
     6  // of bytes that completes before the next record starts.
     7  //
     8  // When reading, call Next to obtain an io.Reader for the next record. Next will
     9  // return io.EOF when there are no more records. It is valid to call Next
    10  // without reading the current record to exhaustion.
    11  //
    12  // When writing, call Next to obtain an io.Writer for the next record. Calling
    13  // Next finishes the current record. Call Close to finish the final record.
    14  //
    15  // Optionally, call Flush to finish the current record and flush the underlying
    16  // writer without starting a new record. To start a new record after flushing,
    17  // call Next.
    18  //
    19  // Neither Readers nor Writers are safe to use concurrently.
    20  //
    21  // Example code:
    22  //	func read(r io.Reader) ([]string, error) {
    23  //		var ss []string
    24  //		records := record.NewReader(r)
    25  //		for {
    26  //			rec, err := records.Next()
    27  //			if err == io.EOF {
    28  //				break
    29  //			}
    30  //			if err != nil {
    31  //				log.Printf("recovering from %v", err)
    32  //				r.Recover()
    33  //				continue
    34  //			}
    35  //			s, err := ioutil.ReadAll(rec)
    36  //			if err != nil {
    37  //				log.Printf("recovering from %v", err)
    38  //				r.Recover()
    39  //				continue
    40  //			}
    41  //			ss = append(ss, string(s))
    42  //		}
    43  //		return ss, nil
    44  //	}
    45  //
    46  //	func write(w io.Writer, ss []string) error {
    47  //		records := record.NewWriter(w)
    48  //		for _, s := range ss {
    49  //			rec, err := records.Next()
    50  //			if err != nil {
    51  //				return err
    52  //			}
    53  //			if _, err := rec.Write([]byte(s)); err != nil {
    54  //				return err
    55  //			}
    56  //		}
    57  //		return records.Close()
    58  //	}
    59  //
    60  // The wire format is that the stream is divided into 32KiB blocks, and each
    61  // block contains a number of tightly packed chunks. Chunks cannot cross block
    62  // boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a
    63  // block must be zero.
    64  //
    65  // A record maps to one or more chunks. There are two chunk formats: legacy and
    66  // recyclable. The legacy chunk format:
    67  //
    68  //   +----------+-----------+-----------+--- ... ---+
    69  //   | CRC (4B) | Size (2B) | Type (1B) | Payload   |
    70  //   +----------+-----------+-----------+--- ... ---+
    71  //
    72  // CRC is computed over the type and payload
    73  // Size is the length of the payload in bytes
    74  // Type is the chunk type
    75  //
    76  // There are four chunk types: whether the chunk is the full record, or the
    77  // first, middle or last chunk of a multi-chunk record. A multi-chunk record
    78  // has one first chunk, zero or more middle chunks, and one last chunk.
    79  //
    80  // The recyclable chunk format is similar to the legacy format, but extends
    81  // the chunk header with an additional log number field. This allows reuse
    82  // (recycling) of log files which can provide significantly better performance
    83  // when syncing frequently as it avoids needing to update the file
    84  // metadata. Additionally, recycling log files is a prerequisite for using direct
    85  // IO with log writing. The recyclable format is:
    86  //
    87  //   +----------+-----------+-----------+----------------+--- ... ---+
    88  //   | CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
    89  //   +----------+-----------+-----------+----------------+--- ... ---+
    90  //
    91  // Recyclable chunks are distinguished from legacy chunks by the addition of 4
    92  // extra "recyclable" chunk types that map directly to the legacy chunk types
    93  // (i.e. full, first, middle, last). The CRC is computed over the type, log
    94  // number, and payload.
    95  //
    96  // The wire format allows for limited recovery in the face of data corruption:
    97  // on a format error (such as a checksum mismatch), the reader moves to the
    98  // next block and looks for the next full or first chunk.
    99  package record // import "github.com/petermattis/pebble/internal/record"
   100  
   101  // The C++ Level-DB code calls this the log, but it has been renamed to record
   102  // to avoid clashing with the standard log package, and because it is generally
   103  // useful outside of logging. The C++ code also uses the term "physical record"
   104  // instead of "chunk", but "chunk" is shorter and less confusing.
   105  
   106  import (
   107  	"encoding/binary"
   108  	"errors"
   109  	"io"
   110  
   111  	"github.com/petermattis/pebble/internal/crc"
   112  )
   113  
// These constants are part of the wire format and should not be changed.
//
// Each chunk header carries one of these values in its type byte. The
// recyclable variants are the corresponding legacy value plus four; the
// reader maps them back onto the legacy values after validating the log
// number stored in the extended header.
const (
	// Legacy chunk types.
	fullChunkType   = 1 // the chunk holds an entire record
	firstChunkType  = 2 // first chunk of a multi-chunk record
	middleChunkType = 3 // interior chunk of a multi-chunk record
	lastChunkType   = 4 // final chunk of a multi-chunk record

	// Recyclable chunk types; same meaning as above, but the header also
	// contains a log number.
	recyclableFullChunkType   = 5
	recyclableFirstChunkType  = 6
	recyclableMiddleChunkType = 7
	recyclableLastChunkType   = 8
)
   126  
// Sizes that define the block and chunk-header layout.
const (
	// blockSize is the size in bytes of a block; chunks never cross a block
	// boundary.
	blockSize = 32 * 1024
	// blockSizeMask extracts the offset-within-block from a file offset
	// (blockSize is a power of two).
	blockSizeMask = blockSize - 1
	// legacyHeaderSize is the size of a legacy chunk header: CRC (4 bytes),
	// size (2 bytes) and type (1 byte).
	legacyHeaderSize = 7
	// recyclableHeaderSize is the legacy header plus a 4-byte log number.
	recyclableHeaderSize = legacyHeaderSize + 4
)
   133  
// Sentinel errors returned by Reader and Writer.
var (
	// ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker.
	ErrNotAnIOSeeker = errors.New("pebble/record: reader does not implement io.Seeker")

	// ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record.
	ErrNoLastRecord = errors.New("pebble/record: no last record exists")

	// ErrZeroedChunk is returned if a chunk is encountered that is zeroed,
	// i.e. its checksum, length and type are all zero. This usually occurs
	// due to log file preallocation.
	ErrZeroedChunk = errors.New("pebble/record: zeroed chunk")

	// ErrInvalidChunk is returned if a chunk is encountered with an invalid
	// header, length, or checksum. This usually occurs when a log is recycled,
	// but can also occur due to corruption.
	ErrInvalidChunk = errors.New("pebble/record: invalid chunk")
)
   150  
// Reader reads records from an underlying io.Reader. It buffers one block at
// a time and hands out a per-record io.Reader from Next.
type Reader struct {
	// r is the underlying reader.
	r io.Reader
	// logNum is the low 32-bits of the log's file number. May be zero when used
	// with log files that do not have a file number (e.g. the MANIFEST).
	logNum uint32
	// blockNum is the zero based block number currently held in buf. It is -1
	// until the first block has been read.
	blockNum int64
	// seq is the sequence number of the current record. It is incremented by
	// Next, recover and seekRecord, which makes any previously returned
	// record reader stale.
	seq int
	// buf[begin:end] is the unread portion of the current chunk's payload. The
	// low bound, begin, excludes the chunk header.
	begin, end int
	// n is the number of bytes of buf that are valid. Once reading has started,
	// only the final block can have n < blockSize.
	n int
	// started is whether Next has been called at all.
	started bool
	// recovering is true when recovering from corruption.
	recovering bool
	// last is whether the current chunk is the last chunk of the record.
	last bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}
   179  
   180  // NewReader returns a new reader. If the file contains records encoded using
   181  // the recyclable record format, then the log number in those records must
   182  // match the specifed logNum.
   183  func NewReader(r io.Reader, logNum uint64) *Reader {
   184  	return &Reader{
   185  		r:        r,
   186  		logNum:   uint32(logNum),
   187  		blockNum: -1,
   188  	}
   189  }
   190  
   191  // nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the
   192  // next block into the buffer if necessary.
   193  func (r *Reader) nextChunk(wantFirst bool) error {
   194  	for {
   195  		if r.end+legacyHeaderSize <= r.n {
   196  			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
   197  			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
   198  			chunkType := r.buf[r.end+6]
   199  
   200  			if checksum == 0 && length == 0 && chunkType == 0 {
   201  				if r.end+recyclableHeaderSize > r.n {
   202  					// Skip the rest of the block if the recyclable header size does not
   203  					// fit within it.
   204  					r.end = r.n
   205  					continue
   206  				}
   207  				if r.recovering {
   208  					// Skip the rest of the block, if it looks like it is all
   209  					// zeroes. This is common with WAL preallocation.
   210  					//
   211  					// Set r.err to be an error so r.recover actually recovers.
   212  					r.err = ErrZeroedChunk
   213  					r.recover()
   214  					continue
   215  				}
   216  				return ErrZeroedChunk
   217  			}
   218  
   219  			headerSize := legacyHeaderSize
   220  			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
   221  				headerSize = recyclableHeaderSize
   222  				if r.end+headerSize > r.n {
   223  					return ErrInvalidChunk
   224  				}
   225  
   226  				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
   227  				if logNum != r.logNum {
   228  					// Treat a record from a previous instance of the log as EOF.
   229  					return io.EOF
   230  				}
   231  
   232  				chunkType -= (recyclableFullChunkType - 1)
   233  			}
   234  
   235  			r.begin = r.end + headerSize
   236  			r.end = r.begin + int(length)
   237  			if r.end > r.n {
   238  				if r.recovering {
   239  					r.recover()
   240  					continue
   241  				}
   242  				return ErrInvalidChunk
   243  			}
   244  			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
   245  				if r.recovering {
   246  					r.recover()
   247  					continue
   248  				}
   249  				return ErrInvalidChunk
   250  			}
   251  			if wantFirst {
   252  				if chunkType != fullChunkType && chunkType != firstChunkType {
   253  					continue
   254  				}
   255  			}
   256  			r.last = chunkType == fullChunkType || chunkType == lastChunkType
   257  			r.recovering = false
   258  			return nil
   259  		}
   260  		if r.n < blockSize && r.started {
   261  			if r.end != r.n {
   262  				return io.ErrUnexpectedEOF
   263  			}
   264  			return io.EOF
   265  		}
   266  		n, err := io.ReadFull(r.r, r.buf[:])
   267  		if err != nil && err != io.ErrUnexpectedEOF {
   268  			return err
   269  		}
   270  		r.begin, r.end, r.n = 0, 0, n
   271  		r.blockNum++
   272  	}
   273  }
   274  
   275  // Next returns a reader for the next record. It returns io.EOF if there are no
   276  // more records. The reader returned becomes stale after the next Next call,
   277  // and should no longer be used.
   278  func (r *Reader) Next() (io.Reader, error) {
   279  	r.seq++
   280  	if r.err != nil {
   281  		return nil, r.err
   282  	}
   283  	r.begin = r.end
   284  	r.err = r.nextChunk(true)
   285  	if r.err != nil {
   286  		return nil, r.err
   287  	}
   288  	r.started = true
   289  	return singleReader{r, r.seq}, nil
   290  }
   291  
   292  // Offset returns the current offset within the file. If called immediately
   293  // before a call to Next(), Offset() will return the record offset.
   294  func (r *Reader) Offset() int64 {
   295  	if r.blockNum < 0 {
   296  		return 0
   297  	}
   298  	return int64(r.blockNum)*blockSize + int64(r.end)
   299  }
   300  
   301  // recover clears any errors read so far, so that calling Next will start
   302  // reading from the next good 32KiB block. If there are no such blocks, Next
   303  // will return io.EOF. recover also marks the current reader, the one most
   304  // recently returned by Next, as stale. If recover is called without any
   305  // prior error, then recover is a no-op.
   306  func (r *Reader) recover() {
   307  	if r.err == nil {
   308  		return
   309  	}
   310  	r.recovering = true
   311  	r.err = nil
   312  	// Discard the rest of the current block.
   313  	r.begin, r.end, r.last = r.n, r.n, false
   314  	// Invalidate any outstanding singleReader.
   315  	r.seq++
   316  }
   317  
   318  // seekRecord seeks in the underlying io.Reader such that calling r.Next
   319  // returns the record whose first chunk header starts at the provided offset.
   320  // Its behavior is undefined if the argument given is not such an offset, as
   321  // the bytes at that offset may coincidentally appear to be a valid header.
   322  //
   323  // It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement
   324  // io.Seeker.
   325  //
   326  // seekRecord will fail and return an error if the Reader previously
   327  // encountered an error, including io.EOF. Such errors can be cleared by
   328  // calling Recover. Calling seekRecord after Recover will make calling Next
   329  // return the record at the given offset, instead of the record at the next
   330  // good 32KiB block as Recover normally would. Calling seekRecord before
   331  // Recover has no effect on Recover's semantics other than changing the
   332  // starting point for determining the next good 32KiB block.
   333  //
   334  // The offset is always relative to the start of the underlying io.Reader, so
   335  // negative values will result in an error as per io.Seeker.
   336  func (r *Reader) seekRecord(offset int64) error {
   337  	r.seq++
   338  	if r.err != nil {
   339  		return r.err
   340  	}
   341  
   342  	s, ok := r.r.(io.Seeker)
   343  	if !ok {
   344  		return ErrNotAnIOSeeker
   345  	}
   346  
   347  	// Only seek to an exact block offset.
   348  	c := int(offset & blockSizeMask)
   349  	if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil {
   350  		return r.err
   351  	}
   352  
   353  	// Clear the state of the internal reader.
   354  	r.begin, r.end, r.n = 0, 0, 0
   355  	r.started, r.recovering, r.last = false, false, false
   356  	if r.err = r.nextChunk(false); r.err != nil {
   357  		return r.err
   358  	}
   359  
   360  	// Now skip to the offset requested within the block. A subsequent
   361  	// call to Next will return the block at the requested offset.
   362  	r.begin, r.end = c, c
   363  
   364  	return nil
   365  }
   366  
// singleReader is the io.Reader for a single record, as returned by
// Reader.Next. It is only valid while its seq matches the Reader's current
// seq.
type singleReader struct {
	// r is the Reader this record belongs to.
	r *Reader
	// seq is the Reader's sequence number at the time this reader was created.
	seq int
}
   371  
   372  func (x singleReader) Read(p []byte) (int, error) {
   373  	r := x.r
   374  	if r.seq != x.seq {
   375  		return 0, errors.New("pebble/record: stale reader")
   376  	}
   377  	if r.err != nil {
   378  		return 0, r.err
   379  	}
   380  	for r.begin == r.end {
   381  		if r.last {
   382  			return 0, io.EOF
   383  		}
   384  		if r.err = r.nextChunk(false); r.err != nil {
   385  			return 0, r.err
   386  		}
   387  	}
   388  	n := copy(p, r.buf[r.begin:r.end])
   389  	r.begin += n
   390  	return n, nil
   391  }
   392  
// Writer writes records to an underlying io.Writer. It buffers one block at
// a time and hands out a per-record io.Writer from Next.
type Writer struct {
	// w is the underlying writer.
	w io.Writer
	// seq is the sequence number of the current record. It is incremented by
	// Next, Flush and Close, which makes any previously returned record
	// writer stale.
	seq int
	// f is w as a flusher. It is nil if w does not implement flusher.
	f flusher
	// buf[i:j] is the bytes that will become the current chunk.
	// The low bound, i, includes the chunk header.
	i, j int
	// buf[:written] has already been written to w.
	// written is zero unless writePending has written part of the current
	// block; it is reset to zero when a full block is written.
	written int
	// baseOffset is the base offset in w at which writing started. If
	// w implements io.Seeker, it's relative to the start of w, 0 otherwise.
	baseOffset int64
	// blockNumber is the zero based block number currently held in buf.
	blockNumber int64
	// lastRecordOffset is the offset of the first chunk header of the record
	// started by the most recent Next call. baseOffset is already included in
	// the stored value. It is -1 until Next has been called.
	lastRecordOffset int64
	// first is whether the current chunk is the first chunk of the record.
	first bool
	// pending is whether a chunk is buffered but not yet written.
	pending bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}
   426  
   427  // NewWriter returns a new Writer.
   428  func NewWriter(w io.Writer) *Writer {
   429  	f, _ := w.(flusher)
   430  
   431  	var o int64
   432  	if s, ok := w.(io.Seeker); ok {
   433  		var err error
   434  		if o, err = s.Seek(0, io.SeekCurrent); err != nil {
   435  			o = 0
   436  		}
   437  	}
   438  	return &Writer{
   439  		w:                w,
   440  		f:                f,
   441  		baseOffset:       o,
   442  		lastRecordOffset: -1,
   443  	}
   444  }
   445  
   446  // fillHeader fills in the header for the pending chunk.
   447  func (w *Writer) fillHeader(last bool) {
   448  	if w.i+legacyHeaderSize > w.j || w.j > blockSize {
   449  		panic("pebble/record: bad writer state")
   450  	}
   451  	if last {
   452  		if w.first {
   453  			w.buf[w.i+6] = fullChunkType
   454  		} else {
   455  			w.buf[w.i+6] = lastChunkType
   456  		}
   457  	} else {
   458  		if w.first {
   459  			w.buf[w.i+6] = firstChunkType
   460  		} else {
   461  			w.buf[w.i+6] = middleChunkType
   462  		}
   463  	}
   464  	binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value())
   465  	binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize))
   466  }
   467  
   468  // writeBlock writes the buffered block to the underlying writer, and reserves
   469  // space for the next chunk's header.
   470  func (w *Writer) writeBlock() {
   471  	_, w.err = w.w.Write(w.buf[w.written:])
   472  	w.i = 0
   473  	w.j = legacyHeaderSize
   474  	w.written = 0
   475  	w.blockNumber++
   476  }
   477  
   478  // writePending finishes the current record and writes the buffer to the
   479  // underlying writer.
   480  func (w *Writer) writePending() {
   481  	if w.err != nil {
   482  		return
   483  	}
   484  	if w.pending {
   485  		w.fillHeader(true)
   486  		w.pending = false
   487  	}
   488  	_, w.err = w.w.Write(w.buf[w.written:w.j])
   489  	w.written = w.j
   490  }
   491  
   492  // Close finishes the current record and closes the writer.
   493  func (w *Writer) Close() error {
   494  	w.seq++
   495  	w.writePending()
   496  	if w.err != nil {
   497  		return w.err
   498  	}
   499  	w.err = errors.New("pebble/record: closed Writer")
   500  	return nil
   501  }
   502  
   503  // Flush finishes the current record, writes to the underlying writer, and
   504  // flushes it if that writer implements interface{ Flush() error }.
   505  func (w *Writer) Flush() error {
   506  	w.seq++
   507  	w.writePending()
   508  	if w.err != nil {
   509  		return w.err
   510  	}
   511  	if w.f != nil {
   512  		w.err = w.f.Flush()
   513  		return w.err
   514  	}
   515  	return nil
   516  }
   517  
   518  // Next returns a writer for the next record. The writer returned becomes stale
   519  // after the next Close, Flush or Next call, and should no longer be used.
   520  func (w *Writer) Next() (io.Writer, error) {
   521  	w.seq++
   522  	if w.err != nil {
   523  		return nil, w.err
   524  	}
   525  	if w.pending {
   526  		w.fillHeader(true)
   527  	}
   528  	w.i = w.j
   529  	w.j = w.j + legacyHeaderSize
   530  	// Check if there is room in the block for the header.
   531  	if w.j > blockSize {
   532  		// Fill in the rest of the block with zeroes.
   533  		for k := w.i; k < blockSize; k++ {
   534  			w.buf[k] = 0
   535  		}
   536  		w.writeBlock()
   537  		if w.err != nil {
   538  			return nil, w.err
   539  		}
   540  	}
   541  	w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i)
   542  	w.first = true
   543  	w.pending = true
   544  	return singleWriter{w, w.seq}, nil
   545  }
   546  
   547  // WriteRecord writes a complete record. Returns the offset just past the end
   548  // of the record.
   549  func (w *Writer) WriteRecord(p []byte) (int64, error) {
   550  	if w.err != nil {
   551  		return -1, w.err
   552  	}
   553  	t, err := w.Next()
   554  	if err != nil {
   555  		return -1, err
   556  	}
   557  	if _, err := t.Write(p); err != nil {
   558  		return -1, err
   559  	}
   560  	w.writePending()
   561  	offset := w.blockNumber*blockSize + int64(w.j)
   562  	return offset, w.err
   563  }
   564  
   565  // Size returns the current size of the file.
   566  func (w *Writer) Size() int64 {
   567  	return w.blockNumber*blockSize + int64(w.j)
   568  }
   569  
   570  // LastRecordOffset returns the offset in the underlying io.Writer of the last
   571  // record so far - the one created by the most recent Next call. It is the
   572  // offset of the first chunk header, suitable to pass to Reader.SeekRecord.
   573  //
   574  // If that io.Writer also implements io.Seeker, the return value is an absolute
   575  // offset, in the sense of io.SeekStart, regardless of whether the io.Writer
   576  // was initially at the zero position when passed to NewWriter. Otherwise, the
   577  // return value is a relative offset, being the number of bytes written between
   578  // the NewWriter call and any records written prior to the last record.
   579  //
   580  // If there is no last record, i.e. nothing was written, LastRecordOffset will
   581  // return ErrNoLastRecord.
   582  func (w *Writer) LastRecordOffset() (int64, error) {
   583  	if w.err != nil {
   584  		return 0, w.err
   585  	}
   586  	if w.lastRecordOffset < 0 {
   587  		return 0, ErrNoLastRecord
   588  	}
   589  	return w.lastRecordOffset, nil
   590  }
   591  
// singleWriter is the io.Writer for a single record, as returned by
// Writer.Next. It is only valid while its seq matches the Writer's current
// seq.
type singleWriter struct {
	// w is the Writer this record belongs to.
	w *Writer
	// seq is the Writer's sequence number at the time this writer was created.
	seq int
}
   596  
   597  func (x singleWriter) Write(p []byte) (int, error) {
   598  	w := x.w
   599  	if w.seq != x.seq {
   600  		return 0, errors.New("pebble/record: stale writer")
   601  	}
   602  	if w.err != nil {
   603  		return 0, w.err
   604  	}
   605  	n0 := len(p)
   606  	for len(p) > 0 {
   607  		// Write a block, if it is full.
   608  		if w.j == blockSize {
   609  			w.fillHeader(false)
   610  			w.writeBlock()
   611  			if w.err != nil {
   612  				return 0, w.err
   613  			}
   614  			w.first = false
   615  		}
   616  		// Copy bytes into the buffer.
   617  		n := copy(w.buf[w.j:], p)
   618  		w.j += n
   619  		p = p[n:]
   620  	}
   621  	return n0, nil
   622  }