github.com/grailbio/base@v0.0.11/logio/reader.go

// Copyright 2019 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package logio

import (
	"errors"
	"fmt"
	"io"
)

// ErrCorrupted is returned when log file corruption is detected.
var ErrCorrupted = errors.New("corrupted log file")

// Reader reads entries from a log file.
type Reader struct {
	rd  io.Reader
	off int64

	needResync bool

	block block
}

// NewReader returns a log file reader that reads log entries from
// the provided io.Reader. The offset must be the current offset of
// the io.Reader into the IO stream from which records are read.
func NewReader(r io.Reader, offset int64) *Reader {
	return &Reader{rd: r, off: offset}
}

// Read returns the next log entry. It returns ErrCorrupted if a
// corrupted log entry was encountered, in which case the next call
// to Read will re-sync the log file, potentially skipping entries.
// The returned slice should not be modified and is only valid until
// the next call to Read or Rewind.
func (r *Reader) Read() (data []byte, err error) {
	if r.needResync {
		if err := r.resync(); err != nil {
			return nil, err
		}
		r.needResync = false
	}
	for first := true; ; first = false {
		if r.block.eof() {
			err := r.block.read(r.rd, &r.off)
			if err == io.EOF && !first {
				return nil, io.ErrUnexpectedEOF
			} else if err != nil {
				return nil, err
			}
		}
		record, ok := r.block.next()
		switch record.typ {
		case recordFull, recordFirst:
			ok = ok && first
		case recordMiddle, recordLast:
			ok = ok && !first
		}
		if !ok {
			r.needResync = true
			return nil, ErrCorrupted
		}
		switch record.typ {
		case recordFull:
			return record.data, nil
		case recordFirst:
			data = append([]byte{}, record.data...)
		case recordMiddle:
			data = append(data, record.data...)
		case recordLast:
			return append(data, record.data...), nil
		}
	}
}

// Reset resets the reader's state; subsequent entries are
// read from the provided reader at the provided offset.
func (r *Reader) Reset(rd io.Reader, offset int64) {
	*r = Reader{rd: rd, off: offset}
}

// resync skips forward, block by block, until it finds a record that
// begins a new entry (recordFull or recordFirst). That record is left
// unconsumed so that Read resumes with the entry it begins.
func (r *Reader) resync() error {
	for {
		if err := r.block.read(r.rd, &r.off); err != nil {
			return err
		}
		for {
			record, ok := r.block.peek()
			if !ok {
				break
			}
			if record.typ == recordFirst || record.typ == recordFull {
				return nil
			}
			r.block.next()
		}
	}
}
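// readAllEntries is a hypothetical usage sketch, not part of the original
// file: it drains a log stream with a Reader starting at offset 0, skipping
// corrupted entries, and returns copies of the entries it read. The function
// name and the skip-on-corruption policy are illustrative assumptions, not
// part of the package API.
func readAllEntries(rd io.Reader) ([][]byte, error) {
	var entries [][]byte
	r := NewReader(rd, 0)
	for {
		data, err := r.Read()
		switch {
		case err == io.EOF:
			return entries, nil
		case err == ErrCorrupted:
			// The next Read re-syncs the stream, possibly skipping entries.
			continue
		case err != nil:
			return nil, err
		}
		// Copy the entry: the returned slice is only valid until the next Read.
		entries = append(entries, append([]byte(nil), data...))
	}
}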
// Rewind finds and returns the offset of the last log entry in the
// log file represented by the reader r. The provided limit is the
// offset of the end of the log stream; thus Rewind may be used to
// traverse a log file in the backwards direction (error handling is
// left as an exercise to the reader):
//
//	file, err := os.Open(...)
//	info, err := file.Stat()
//	off := info.Size()
//	for {
//		off, err = logio.Rewind(file, off)
//		if err == io.EOF {
//			break
//		}
//		file.Seek(off, io.SeekStart)
//		record, err := logio.NewReader(file, off).Read()
//	}
//
// Rewind returns io.EOF when no records can be located in the
// reader limited by the provided limit.
//
// If the passed reader is also an io.Seeker, then Rewind will seek
// to the returned offset.
func Rewind(r io.ReaderAt, limit int64) (off int64, err error) {
	if s, ok := r.(io.Seeker); ok {
		defer func() {
			if err != nil {
				return
			}
			off, err = s.Seek(off, io.SeekStart)
		}()
	}

	if limit <= headersz {
		return 0, io.EOF
	}
	off = limit - limit%Blocksz
	// Special case: if the limit is on a block boundary, we begin by rewinding
	// to the previous block.
	if off == limit {
		off -= Blocksz
	}
	for ; off >= 0; off -= Blocksz {
		var b block
		off -= off % Blocksz
		if err = b.readLimit(r, off, limit); err != nil {
			return
		}

		// Find the last valid record in the block.
		var last record
		for {
			r, ok := b.next()
			if !ok {
				break
			}
			last = r
		}
		if last.isEmpty() {
			// First record was invalid; try previous block.
			continue
		}

		// Jump back to the first record of the entry to which the last
		// record belongs, and verify that a valid entry begins there.
		off += int64(last.blockOff) - int64(last.offset)
		err = b.readLimit(r, off, limit)
		if err != nil {
			return
		}
		if r, ok := b.next(); ok && r.offset == 0 {
			return
		}
	}
	err = io.EOF
	return
}
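// lastEntries is a hypothetical sketch, not part of the original file: it
// fills in the error handling that the Rewind example above leaves to the
// reader. It walks the stream backwards with Rewind, reading each entry
// through an io.SectionReader, and returns copies of the entries in reverse
// order. The function name and the size parameter (the total length of the
// stream read by rd) are illustrative assumptions.
func lastEntries(rd io.ReaderAt, size int64) ([][]byte, error) {
	var entries [][]byte
	off := size
	for {
		var err error
		off, err = Rewind(rd, off)
		if err == io.EOF {
			// No further entries located before off.
			return entries, nil
		} else if err != nil {
			return nil, err
		}
		data, err := NewReader(io.NewSectionReader(rd, off, size-off), off).Read()
		if err != nil {
			return nil, err
		}
		entries = append(entries, append([]byte(nil), data...))
	}
}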
type record struct {
	blockOff int

	typ    uint8
	offset uint64
	data   []byte
}

func (r record) String() string {
	return fmt.Sprintf("record blockOff:%d typ:%d offset:%d data:%d", r.blockOff, r.typ, r.offset, len(r.data))
}

func (r record) isEmpty() bool {
	return r.blockOff == 0 && r.typ == 0 && r.offset == 0 && r.data == nil
}

type block struct {
	buf        [Blocksz]byte
	off, limit int
	parsed     record
	ok         bool
}

func (b *block) String() string {
	return fmt.Sprintf("block off:%d limit:%d", b.off, b.limit)
}

// eof reports whether the block is exhausted: no parsed record is pending
// and too few bytes remain to hold another record header.
func (b *block) eof() bool {
	return b.off >= b.limit-headersz && b.parsed.isEmpty()
}

// next returns the next record in the block and consumes it.
func (b *block) next() (record, bool) {
	rec, ok := b.peek()
	b.parsed = record{}
	return rec, ok
}

// peek returns the next record in the block without consuming it.
func (b *block) peek() (record, bool) {
	if b.parsed.isEmpty() {
		b.parsed, b.ok = b.parse()
	}
	return b.parsed, b.ok
}

// parse decodes the record at the block's current offset: a header
// (checksum, type, payload length, entry offset) followed by the payload.
// It returns false if the record does not fit in the block, its checksum
// does not match, or its entry offset is inconsistent with its type.
func (b *block) parse() (record, bool) {
	if b.off >= b.limit-headersz {
		return record{}, false
	}
	var r record
	r.blockOff = b.off
	chk := b.uint32()
	r.typ = b.uint8()
	length := b.uint16()
	r.offset = b.uint64()
	if int(length) > b.limit-b.off || checksum(b.buf[r.blockOff+4:r.blockOff+headersz+int(length)]) != chk {
		return record{}, false
	}
	r.data = b.bytes(int(length))
	var ok bool
	switch r.typ {
	case recordFirst, recordFull:
		ok = r.offset == 0
	default:
		ok = r.offset != 0
	}
	return r, ok
}

// read fills the block from r, which must be positioned at stream offset
// *off, reading up to the next block boundary; *off is advanced by the
// number of bytes read.
func (b *block) read(r io.Reader, off *int64) error {
	b.reset(Blocksz - int(*off%Blocksz))
	n, err := io.ReadFull(r, b.buf[:b.limit])
	if err == io.ErrUnexpectedEOF {
		b.limit = n
		err = nil
	}
	*off += int64(n)
	return err
}

// readLimit fills the block from r at offset off, reading up to the next
// block boundary or to limit, whichever comes first.
func (b *block) readLimit(r io.ReaderAt, off, limit int64) error {
	b.reset(Blocksz - int(off%Blocksz))
	if n := limit - off; n < int64(b.limit) {
		b.limit = int(n)
	}
	if b.limit > len(b.buf) {
		panic(off)
	}
	n, err := r.ReadAt(b.buf[:b.limit], off)
	if err == io.EOF && n == b.limit && n < Blocksz {
		err = nil
	}
	return err
}

// reset clears any parse state and sets the block's limit.
func (b *block) reset(limit int) {
	b.parsed = record{}
	b.off = 0
	b.limit = limit
}

func (b *block) uint8() uint8 {
	v := b.buf[b.off]
	b.off++
	return uint8(v)
}

func (b *block) uint16() uint16 {
	v := byteOrder.Uint16(b.buf[b.off:])
	b.off += 2
	return v
}

func (b *block) uint32() uint32 {
	v := byteOrder.Uint32(b.buf[b.off:])
	b.off += 4
	return v
}

func (b *block) uint64() uint64 {
	v := byteOrder.Uint64(b.buf[b.off:])
	b.off += 8
	return v
}

func (b *block) bytes(n int) []byte {
	p := b.buf[b.off : b.off+n]
	b.off += n
	return p
}
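// appendFull is a hypothetical sketch, not part of the original file,
// illustrating the record layout that parse decodes: a headersz-byte header
// holding a 4-byte checksum, a 1-byte type, a 2-byte payload length, and an
// 8-byte entry offset, followed by the payload. It assumes byteOrder
// satisfies binary.ByteOrder (so the Put* methods exist) and that the framed
// record fits within the current block; the package's writer implements the
// real framing logic.
func appendFull(buf, data []byte) []byte {
	start := len(buf)
	buf = append(buf, make([]byte, headersz)...)
	buf = append(buf, data...)
	buf[start+4] = recordFull                             // record type
	byteOrder.PutUint16(buf[start+5:], uint16(len(data))) // payload length
	byteOrder.PutUint64(buf[start+7:], 0)                 // entry offset: 0 for recordFull
	// The checksum covers everything after itself: type, length, offset, payload.
	byteOrder.PutUint32(buf[start:], checksum(buf[start+4:]))
	return buf
}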