// github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/record/record.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

// Package record reads and writes sequences of records. Each record is a stream
// of bytes that completes before the next record starts.
//
// When reading, call Next to obtain an io.Reader for the next record. Next will
// return io.EOF when there are no more records. It is valid to call Next
// without reading the current record to exhaustion.
//
// When writing, call Next to obtain an io.Writer for the next record. Calling
// Next finishes the current record. Call Close to finish the final record.
//
// Optionally, call Flush to finish the current record and flush the underlying
// writer without starting a new record. To start a new record after flushing,
// call Next.
//
// Neither Readers nor Writers are safe to use concurrently.
//
// Example code:
//	func read(r io.Reader) ([]string, error) {
//		var ss []string
//		records := record.NewReader(r, 0 /* logNum */)
//		for {
//			rec, err := records.Next()
//			if err == io.EOF {
//				break
//			}
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				r.Recover()
//				continue
//			}
//			s, err := ioutil.ReadAll(rec)
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				r.Recover()
//				continue
//			}
//			ss = append(ss, string(s))
//		}
//		return ss, nil
//	}
//
//	func write(w io.Writer, ss []string) error {
//		records := record.NewWriter(w)
//		for _, s := range ss {
//			rec, err := records.Next()
//			if err != nil {
//				return err
//			}
//			if _, err := rec.Write([]byte(s)); err != nil {
//				return err
//			}
//		}
//		return records.Close()
//	}
//
// The wire format is that the stream is divided into 32KiB blocks, and each
// block contains a number of tightly packed chunks. Chunks cannot cross block
// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a
// block must be zero.
//
// A record maps to one or more chunks. There are two chunk formats: legacy and
// recyclable. The legacy chunk format:
//
//	+----------+-----------+-----------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Payload   |
//	+----------+-----------+-----------+--- ... ---+
//
// CRC is computed over the type and payload
// Size is the length of the payload in bytes
// Type is the chunk type
//
// There are four chunk types: whether the chunk is the full record, or the
// first, middle or last chunk of a multi-chunk record. A multi-chunk record
// has one first chunk, zero or more middle chunks, and one last chunk.
//
// The recyclable chunk format is similar to the legacy format, but extends
// the chunk header with an additional log number field. This allows reuse
// (recycling) of log files which can provide significantly better performance
// when syncing frequently as it avoids needing to update the file
// metadata. Additionally, recycling log files is a prerequisite for using
// direct IO with log writing. The recyclable format is:
//
//	+----------+-----------+-----------+----------------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
//	+----------+-----------+-----------+----------------+--- ... ---+
//
// Recyclable chunks are distinguished from legacy chunks by the addition of 4
// extra "recyclable" chunk types that map directly to the legacy chunk types
// (i.e. full, first, middle, last). The CRC is computed over the type, log
// number, and payload.
//
// The wire format allows for limited recovery in the face of data corruption:
// on a format error (such as a checksum mismatch), the reader moves to the
// next block and looks for the next full or first chunk.
package record // import "github.com/petermattis/pebble/internal/record"

// The C++ Level-DB code calls this the log, but it has been renamed to record
// to avoid clashing with the standard log package, and because it is generally
// useful outside of logging. The C++ code also uses the term "physical record"
// instead of "chunk", but "chunk" is shorter and less confusing.

import (
	"encoding/binary"
	"errors"
	"io"

	"github.com/petermattis/pebble/internal/crc"
)

// These constants are part of the wire format and should not be changed.
115 const ( 116 fullChunkType = 1 117 firstChunkType = 2 118 middleChunkType = 3 119 lastChunkType = 4 120 121 recyclableFullChunkType = 5 122 recyclableFirstChunkType = 6 123 recyclableMiddleChunkType = 7 124 recyclableLastChunkType = 8 125 ) 126 127 const ( 128 blockSize = 32 * 1024 129 blockSizeMask = blockSize - 1 130 legacyHeaderSize = 7 131 recyclableHeaderSize = legacyHeaderSize + 4 132 ) 133 134 var ( 135 // ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker. 136 ErrNotAnIOSeeker = errors.New("pebble/record: reader does not implement io.Seeker") 137 138 // ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record. 139 ErrNoLastRecord = errors.New("pebble/record: no last record exists") 140 141 // ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This 142 // usually occurs due to log file preallocation. 143 ErrZeroedChunk = errors.New("pebble/record: zeroed chunk") 144 145 // ErrInvalidChunk is returned if a chunk is encountered with an invalid 146 // header, length, or checksum. This usually occurs when a log is recycled, 147 // but can also occur due to corruption. 148 ErrInvalidChunk = errors.New("pebble/record: invalid chunk") 149 ) 150 151 // Reader reads records from an underlying io.Reader. 152 type Reader struct { 153 // r is the underlying reader. 154 r io.Reader 155 // logNum is the low 32-bits of the log's file number. May be zero when used 156 // with log files that do not have a file number (e.g. the MANIFEST). 157 logNum uint32 158 // blockNum is the zero based block number currently held in buf. 159 blockNum int64 160 // seq is the sequence number of the current record. 161 seq int 162 // buf[begin:end] is the unread portion of the current chunk's payload. The 163 // low bound, begin, excludes the chunk header. 164 begin, end int 165 // n is the number of bytes of buf that are valid. 
Once reading has started, 166 // only the final block can have n < blockSize. 167 n int 168 // started is whether Next has been called at all. 169 started bool 170 // recovering is true when recovering from corruption. 171 recovering bool 172 // last is whether the current chunk is the last chunk of the record. 173 last bool 174 // err is any accumulated error. 175 err error 176 // buf is the buffer. 177 buf [blockSize]byte 178 } 179 180 // NewReader returns a new reader. If the file contains records encoded using 181 // the recyclable record format, then the log number in those records must 182 // match the specifed logNum. 183 func NewReader(r io.Reader, logNum uint64) *Reader { 184 return &Reader{ 185 r: r, 186 logNum: uint32(logNum), 187 blockNum: -1, 188 } 189 } 190 191 // nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the 192 // next block into the buffer if necessary. 193 func (r *Reader) nextChunk(wantFirst bool) error { 194 for { 195 if r.end+legacyHeaderSize <= r.n { 196 checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4]) 197 length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6]) 198 chunkType := r.buf[r.end+6] 199 200 if checksum == 0 && length == 0 && chunkType == 0 { 201 if r.end+recyclableHeaderSize > r.n { 202 // Skip the rest of the block if the recyclable header size does not 203 // fit within it. 204 r.end = r.n 205 continue 206 } 207 if r.recovering { 208 // Skip the rest of the block, if it looks like it is all 209 // zeroes. This is common with WAL preallocation. 210 // 211 // Set r.err to be an error so r.recover actually recovers. 
212 r.err = ErrZeroedChunk 213 r.recover() 214 continue 215 } 216 return ErrZeroedChunk 217 } 218 219 headerSize := legacyHeaderSize 220 if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType { 221 headerSize = recyclableHeaderSize 222 if r.end+headerSize > r.n { 223 return ErrInvalidChunk 224 } 225 226 logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11]) 227 if logNum != r.logNum { 228 // Treat a record from a previous instance of the log as EOF. 229 return io.EOF 230 } 231 232 chunkType -= (recyclableFullChunkType - 1) 233 } 234 235 r.begin = r.end + headerSize 236 r.end = r.begin + int(length) 237 if r.end > r.n { 238 if r.recovering { 239 r.recover() 240 continue 241 } 242 return ErrInvalidChunk 243 } 244 if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() { 245 if r.recovering { 246 r.recover() 247 continue 248 } 249 return ErrInvalidChunk 250 } 251 if wantFirst { 252 if chunkType != fullChunkType && chunkType != firstChunkType { 253 continue 254 } 255 } 256 r.last = chunkType == fullChunkType || chunkType == lastChunkType 257 r.recovering = false 258 return nil 259 } 260 if r.n < blockSize && r.started { 261 if r.end != r.n { 262 return io.ErrUnexpectedEOF 263 } 264 return io.EOF 265 } 266 n, err := io.ReadFull(r.r, r.buf[:]) 267 if err != nil && err != io.ErrUnexpectedEOF { 268 return err 269 } 270 r.begin, r.end, r.n = 0, 0, n 271 r.blockNum++ 272 } 273 } 274 275 // Next returns a reader for the next record. It returns io.EOF if there are no 276 // more records. The reader returned becomes stale after the next Next call, 277 // and should no longer be used. 278 func (r *Reader) Next() (io.Reader, error) { 279 r.seq++ 280 if r.err != nil { 281 return nil, r.err 282 } 283 r.begin = r.end 284 r.err = r.nextChunk(true) 285 if r.err != nil { 286 return nil, r.err 287 } 288 r.started = true 289 return singleReader{r, r.seq}, nil 290 } 291 292 // Offset returns the current offset within the file. 
If called immediately 293 // before a call to Next(), Offset() will return the record offset. 294 func (r *Reader) Offset() int64 { 295 if r.blockNum < 0 { 296 return 0 297 } 298 return int64(r.blockNum)*blockSize + int64(r.end) 299 } 300 301 // recover clears any errors read so far, so that calling Next will start 302 // reading from the next good 32KiB block. If there are no such blocks, Next 303 // will return io.EOF. recover also marks the current reader, the one most 304 // recently returned by Next, as stale. If recover is called without any 305 // prior error, then recover is a no-op. 306 func (r *Reader) recover() { 307 if r.err == nil { 308 return 309 } 310 r.recovering = true 311 r.err = nil 312 // Discard the rest of the current block. 313 r.begin, r.end, r.last = r.n, r.n, false 314 // Invalidate any outstanding singleReader. 315 r.seq++ 316 } 317 318 // seekRecord seeks in the underlying io.Reader such that calling r.Next 319 // returns the record whose first chunk header starts at the provided offset. 320 // Its behavior is undefined if the argument given is not such an offset, as 321 // the bytes at that offset may coincidentally appear to be a valid header. 322 // 323 // It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement 324 // io.Seeker. 325 // 326 // seekRecord will fail and return an error if the Reader previously 327 // encountered an error, including io.EOF. Such errors can be cleared by 328 // calling Recover. Calling seekRecord after Recover will make calling Next 329 // return the record at the given offset, instead of the record at the next 330 // good 32KiB block as Recover normally would. Calling seekRecord before 331 // Recover has no effect on Recover's semantics other than changing the 332 // starting point for determining the next good 32KiB block. 333 // 334 // The offset is always relative to the start of the underlying io.Reader, so 335 // negative values will result in an error as per io.Seeker. 
336 func (r *Reader) seekRecord(offset int64) error { 337 r.seq++ 338 if r.err != nil { 339 return r.err 340 } 341 342 s, ok := r.r.(io.Seeker) 343 if !ok { 344 return ErrNotAnIOSeeker 345 } 346 347 // Only seek to an exact block offset. 348 c := int(offset & blockSizeMask) 349 if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil { 350 return r.err 351 } 352 353 // Clear the state of the internal reader. 354 r.begin, r.end, r.n = 0, 0, 0 355 r.started, r.recovering, r.last = false, false, false 356 if r.err = r.nextChunk(false); r.err != nil { 357 return r.err 358 } 359 360 // Now skip to the offset requested within the block. A subsequent 361 // call to Next will return the block at the requested offset. 362 r.begin, r.end = c, c 363 364 return nil 365 } 366 367 type singleReader struct { 368 r *Reader 369 seq int 370 } 371 372 func (x singleReader) Read(p []byte) (int, error) { 373 r := x.r 374 if r.seq != x.seq { 375 return 0, errors.New("pebble/record: stale reader") 376 } 377 if r.err != nil { 378 return 0, r.err 379 } 380 for r.begin == r.end { 381 if r.last { 382 return 0, io.EOF 383 } 384 if r.err = r.nextChunk(false); r.err != nil { 385 return 0, r.err 386 } 387 } 388 n := copy(p, r.buf[r.begin:r.end]) 389 r.begin += n 390 return n, nil 391 } 392 393 // Writer writes records to an underlying io.Writer. 394 type Writer struct { 395 // w is the underlying writer. 396 w io.Writer 397 // seq is the sequence number of the current record. 398 seq int 399 // f is w as a flusher. 400 f flusher 401 // buf[i:j] is the bytes that will become the current chunk. 402 // The low bound, i, includes the chunk header. 403 i, j int 404 // buf[:written] has already been written to w. 405 // written is zero unless Flush has been called. 406 written int 407 // baseOffset is the base offset in w at which writing started. If 408 // w implements io.Seeker, it's relative to the start of w, 0 otherwise. 
409 baseOffset int64 410 // blockNumber is the zero based block number currently held in buf. 411 blockNumber int64 412 // lastRecordOffset is the offset in w where the last record was 413 // written (including the chunk header). It is a relative offset to 414 // baseOffset, thus the absolute offset of the last record is 415 // baseOffset + lastRecordOffset. 416 lastRecordOffset int64 417 // first is whether the current chunk is the first chunk of the record. 418 first bool 419 // pending is whether a chunk is buffered but not yet written. 420 pending bool 421 // err is any accumulated error. 422 err error 423 // buf is the buffer. 424 buf [blockSize]byte 425 } 426 427 // NewWriter returns a new Writer. 428 func NewWriter(w io.Writer) *Writer { 429 f, _ := w.(flusher) 430 431 var o int64 432 if s, ok := w.(io.Seeker); ok { 433 var err error 434 if o, err = s.Seek(0, io.SeekCurrent); err != nil { 435 o = 0 436 } 437 } 438 return &Writer{ 439 w: w, 440 f: f, 441 baseOffset: o, 442 lastRecordOffset: -1, 443 } 444 } 445 446 // fillHeader fills in the header for the pending chunk. 447 func (w *Writer) fillHeader(last bool) { 448 if w.i+legacyHeaderSize > w.j || w.j > blockSize { 449 panic("pebble/record: bad writer state") 450 } 451 if last { 452 if w.first { 453 w.buf[w.i+6] = fullChunkType 454 } else { 455 w.buf[w.i+6] = lastChunkType 456 } 457 } else { 458 if w.first { 459 w.buf[w.i+6] = firstChunkType 460 } else { 461 w.buf[w.i+6] = middleChunkType 462 } 463 } 464 binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value()) 465 binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize)) 466 } 467 468 // writeBlock writes the buffered block to the underlying writer, and reserves 469 // space for the next chunk's header. 
470 func (w *Writer) writeBlock() { 471 _, w.err = w.w.Write(w.buf[w.written:]) 472 w.i = 0 473 w.j = legacyHeaderSize 474 w.written = 0 475 w.blockNumber++ 476 } 477 478 // writePending finishes the current record and writes the buffer to the 479 // underlying writer. 480 func (w *Writer) writePending() { 481 if w.err != nil { 482 return 483 } 484 if w.pending { 485 w.fillHeader(true) 486 w.pending = false 487 } 488 _, w.err = w.w.Write(w.buf[w.written:w.j]) 489 w.written = w.j 490 } 491 492 // Close finishes the current record and closes the writer. 493 func (w *Writer) Close() error { 494 w.seq++ 495 w.writePending() 496 if w.err != nil { 497 return w.err 498 } 499 w.err = errors.New("pebble/record: closed Writer") 500 return nil 501 } 502 503 // Flush finishes the current record, writes to the underlying writer, and 504 // flushes it if that writer implements interface{ Flush() error }. 505 func (w *Writer) Flush() error { 506 w.seq++ 507 w.writePending() 508 if w.err != nil { 509 return w.err 510 } 511 if w.f != nil { 512 w.err = w.f.Flush() 513 return w.err 514 } 515 return nil 516 } 517 518 // Next returns a writer for the next record. The writer returned becomes stale 519 // after the next Close, Flush or Next call, and should no longer be used. 520 func (w *Writer) Next() (io.Writer, error) { 521 w.seq++ 522 if w.err != nil { 523 return nil, w.err 524 } 525 if w.pending { 526 w.fillHeader(true) 527 } 528 w.i = w.j 529 w.j = w.j + legacyHeaderSize 530 // Check if there is room in the block for the header. 531 if w.j > blockSize { 532 // Fill in the rest of the block with zeroes. 533 for k := w.i; k < blockSize; k++ { 534 w.buf[k] = 0 535 } 536 w.writeBlock() 537 if w.err != nil { 538 return nil, w.err 539 } 540 } 541 w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i) 542 w.first = true 543 w.pending = true 544 return singleWriter{w, w.seq}, nil 545 } 546 547 // WriteRecord writes a complete record. 
Returns the offset just past the end 548 // of the record. 549 func (w *Writer) WriteRecord(p []byte) (int64, error) { 550 if w.err != nil { 551 return -1, w.err 552 } 553 t, err := w.Next() 554 if err != nil { 555 return -1, err 556 } 557 if _, err := t.Write(p); err != nil { 558 return -1, err 559 } 560 w.writePending() 561 offset := w.blockNumber*blockSize + int64(w.j) 562 return offset, w.err 563 } 564 565 // Size returns the current size of the file. 566 func (w *Writer) Size() int64 { 567 return w.blockNumber*blockSize + int64(w.j) 568 } 569 570 // LastRecordOffset returns the offset in the underlying io.Writer of the last 571 // record so far - the one created by the most recent Next call. It is the 572 // offset of the first chunk header, suitable to pass to Reader.SeekRecord. 573 // 574 // If that io.Writer also implements io.Seeker, the return value is an absolute 575 // offset, in the sense of io.SeekStart, regardless of whether the io.Writer 576 // was initially at the zero position when passed to NewWriter. Otherwise, the 577 // return value is a relative offset, being the number of bytes written between 578 // the NewWriter call and any records written prior to the last record. 579 // 580 // If there is no last record, i.e. nothing was written, LastRecordOffset will 581 // return ErrNoLastRecord. 582 func (w *Writer) LastRecordOffset() (int64, error) { 583 if w.err != nil { 584 return 0, w.err 585 } 586 if w.lastRecordOffset < 0 { 587 return 0, ErrNoLastRecord 588 } 589 return w.lastRecordOffset, nil 590 } 591 592 type singleWriter struct { 593 w *Writer 594 seq int 595 } 596 597 func (x singleWriter) Write(p []byte) (int, error) { 598 w := x.w 599 if w.seq != x.seq { 600 return 0, errors.New("pebble/record: stale writer") 601 } 602 if w.err != nil { 603 return 0, w.err 604 } 605 n0 := len(p) 606 for len(p) > 0 { 607 // Write a block, if it is full. 
608 if w.j == blockSize { 609 w.fillHeader(false) 610 w.writeBlock() 611 if w.err != nil { 612 return 0, w.err 613 } 614 w.first = false 615 } 616 // Copy bytes into the buffer. 617 n := copy(w.buf[w.j:], p) 618 w.j += n 619 p = p[n:] 620 } 621 return n0, nil 622 }