github.com/grailbio/base@v0.0.11/logio/reader.go (about)

     1  // Copyright 2019 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package logio
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  )
    12  
    13  // ErrCorrupted is returned when log file corruption is detected.
    14  var ErrCorrupted = errors.New("corrupted log file")
    15  
    16  // Reader reads entries from a log file.
    17  type Reader struct {
    18  	rd  io.Reader
    19  	off int64
    20  
    21  	needResync bool
    22  
    23  	block block
    24  }
    25  
    26  // NewReader returns a log file reader that reads log entries from
    27  // the provider io.Reader. The offset must be the current offset of
    28  // the io.Reader into the IO stream from which records are read.
    29  func NewReader(r io.Reader, offset int64) *Reader {
    30  	return &Reader{rd: r, off: offset}
    31  }
    32  
    33  // Read returns the next log entry. It returns ErrCorrupted if a
    34  // corrupted log entry was encountered, in which case the next call
    35  // to Read will re-sync the log file, potentially skipping entries.
    36  // The returned slice should not be modified and is only valid until
    37  // the next call to Read or Rewind.
    38  func (r *Reader) Read() (data []byte, err error) {
    39  	if r.needResync {
    40  		if err := r.resync(); err != nil {
    41  			return nil, err
    42  		}
    43  		r.needResync = false
    44  	}
    45  	for first := true; ; first = false {
    46  		if r.block.eof() {
    47  			err := r.block.read(r.rd, &r.off)
    48  			if err == io.EOF && !first {
    49  				return nil, io.ErrUnexpectedEOF
    50  			} else if err != nil {
    51  				return nil, err
    52  			}
    53  		}
    54  		record, ok := r.block.next()
    55  		switch record.typ {
    56  		case recordFull, recordFirst:
    57  			ok = ok && first
    58  		case recordMiddle, recordLast:
    59  			ok = ok && !first
    60  		}
    61  		if !ok {
    62  			r.needResync = true
    63  			return nil, ErrCorrupted
    64  		}
    65  		switch record.typ {
    66  		case recordFull:
    67  			return record.data, nil
    68  		case recordFirst:
    69  			data = append([]byte{}, record.data...)
    70  		case recordMiddle:
    71  			data = append(data, record.data...)
    72  		case recordLast:
    73  			return append(data, record.data...), nil
    74  		}
    75  	}
    76  }
    77  
    78  // Reset resets the reader's state; subsequent entries are
    79  // read from the provided reader at the provided offset.
    80  func (r *Reader) Reset(rd io.Reader, offset int64) {
    81  	*r = Reader{rd: rd, off: offset}
    82  }
    83  
    84  func (r *Reader) resync() error {
    85  	for {
    86  		if err := r.block.read(r.rd, &r.off); err != nil {
    87  			return err
    88  		}
    89  		for {
    90  			record, ok := r.block.peek()
    91  			if !ok {
    92  				break
    93  			}
    94  			if record.typ == recordFirst || record.typ == recordFull {
    95  				return nil
    96  			}
    97  			r.block.next()
    98  		}
    99  	}
   100  }
   101  
   102  // Rewind finds and returns the offset of the last log entry in the
   103  // log file represented by the reader r. The provided limit is the
   104  // offset of the end of the log stream; thus Rewind may be used to
   105  // traverse a log file in the backwards direction (error handling is
   106  // left as an exercise to the reader):
   107  //
   108  //	file, err := os.Open(...)
   109  //	info, err := file.Stat()
   110  //	off := info.Size()
   111  //	for {
   112  //		off, err = logio.Rewind(file, off)
   113  //		if err == io.EOF {
   114  //			break
   115  //		}
   116  //		file.Seek(off, io.SeekStart)
   117  //		record, err := logio.NewReader(file, off).Read()
   118  // 	}
   119  //
   120  // Rewind returns io.EOF when no records can be located in the
   121  // reader limited by the provided limit.
   122  //
   123  // If the passed reader is also an io.Seeker, then Rewind will seek
   124  // to the returned offset.
   125  func Rewind(r io.ReaderAt, limit int64) (off int64, err error) {
   126  	if s, ok := r.(io.Seeker); ok {
   127  		defer func() {
   128  			if err != nil {
   129  				return
   130  			}
   131  			off, err = s.Seek(off, io.SeekStart)
   132  		}()
   133  	}
   134  
   135  	if limit <= headersz {
   136  		return 0, io.EOF
   137  	}
   138  	off = limit - limit%Blocksz
   139  	// Special case: if the limit is on a block boundary, we begin by rewinding
   140  	// to the previous block.
   141  	if off == limit {
   142  		off -= Blocksz
   143  	}
   144  	for ; off >= 0; off -= Blocksz {
   145  		var b block
   146  		off -= off % Blocksz
   147  		if err = b.readLimit(r, off, limit); err != nil {
   148  			return
   149  		}
   150  
   151  		// Find the last valid record in the block.
   152  		var last record
   153  		for {
   154  			r, ok := b.next()
   155  			if !ok {
   156  				break
   157  			}
   158  			last = r
   159  		}
   160  		if last.isEmpty() {
   161  			// First record was invalid; try previous block.
   162  			continue
   163  		}
   164  
   165  		off += int64(last.blockOff) - int64(last.offset)
   166  		err = b.readLimit(r, off, limit)
   167  		if err != nil {
   168  			return
   169  		}
   170  		if r, ok := b.next(); ok && r.offset == 0 {
   171  			return
   172  		}
   173  	}
   174  	err = io.EOF
   175  	return
   176  }
   177  
   178  type record struct {
   179  	blockOff int
   180  
   181  	typ    uint8
   182  	offset uint64
   183  	data   []byte
   184  }
   185  
   186  func (r record) String() string {
   187  	return fmt.Sprintf("record blockOff:%d typ:%d offset:%d data:%d", r.blockOff, r.typ, r.offset, len(r.data))
   188  }
   189  
   190  func (r record) isEmpty() bool {
   191  	return r.blockOff == 0 && r.typ == 0 && r.offset == 0 && r.data == nil
   192  }
   193  
   194  type block struct {
   195  	buf        [Blocksz]byte
   196  	off, limit int
   197  	parsed     record
   198  	ok         bool
   199  }
   200  
   201  func (b *block) String() string {
   202  	return fmt.Sprintf("block off:%d limit:%d", b.off, b.limit)
   203  }
   204  
   205  func (b *block) eof() bool {
   206  	return b.off >= b.limit-headersz && b.parsed.isEmpty()
   207  }
   208  
   209  func (b *block) next() (record, bool) {
   210  	rec, ok := b.peek()
   211  	b.parsed = record{}
   212  	return rec, ok
   213  }
   214  
   215  func (b *block) peek() (record, bool) {
   216  	if b.parsed.isEmpty() {
   217  		b.parsed, b.ok = b.parse()
   218  	}
   219  	return b.parsed, b.ok
   220  }
   221  
   222  func (b *block) parse() (record, bool) {
   223  	if b.off >= b.limit-headersz {
   224  		return record{}, false
   225  	}
   226  	var r record
   227  	r.blockOff = b.off
   228  	chk := b.uint32()
   229  	r.typ = b.uint8()
   230  	length := b.uint16()
   231  	r.offset = b.uint64()
   232  	if int(length) > b.limit-b.off || checksum(b.buf[r.blockOff+4:r.blockOff+headersz+int(length)]) != chk {
   233  		return record{}, false
   234  	}
   235  	r.data = b.bytes(int(length))
   236  	var ok bool
   237  	switch r.typ {
   238  	case recordFirst, recordFull:
   239  		ok = r.offset == 0
   240  	default:
   241  		ok = r.offset != 0
   242  	}
   243  	return r, ok
   244  }
   245  
   246  func (b *block) read(r io.Reader, off *int64) error {
   247  	b.reset(Blocksz - int(*off%Blocksz))
   248  	n, err := io.ReadFull(r, b.buf[:b.limit])
   249  	if err == io.ErrUnexpectedEOF {
   250  		b.limit = n
   251  		err = nil
   252  	}
   253  	*off += int64(n)
   254  	return err
   255  }
   256  
   257  func (b *block) readLimit(r io.ReaderAt, off, limit int64) error {
   258  	b.reset(Blocksz - int(off%Blocksz))
   259  	if n := limit - off; n < int64(b.limit) {
   260  		b.limit = int(n)
   261  	}
   262  	if b.limit > len(b.buf) {
   263  		panic(off)
   264  	}
   265  	n, err := r.ReadAt(b.buf[:b.limit], off)
   266  	if err == io.EOF && n == b.limit && n < Blocksz {
   267  		err = nil
   268  	}
   269  	return err
   270  }
   271  
   272  func (b *block) reset(limit int) {
   273  	b.parsed = record{}
   274  	b.off = 0
   275  	b.limit = limit
   276  }
   277  
   278  func (b *block) uint8() uint8 {
   279  	v := b.buf[b.off]
   280  	b.off++
   281  	return uint8(v)
   282  }
   283  
   284  func (b *block) uint16() uint16 {
   285  	v := byteOrder.Uint16(b.buf[b.off:])
   286  	b.off += 2
   287  	return v
   288  }
   289  
   290  func (b *block) uint32() uint32 {
   291  	v := byteOrder.Uint32(b.buf[b.off:])
   292  	b.off += 4
   293  	return v
   294  }
   295  
   296  func (b *block) uint64() uint64 {
   297  	v := byteOrder.Uint64(b.buf[b.off:])
   298  	b.off += 8
   299  	return v
   300  }
   301  
   302  func (b *block) bytes(n int) []byte {
   303  	p := b.buf[b.off : b.off+n]
   304  	b.off += n
   305  	return p
   306  }