github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/record/record.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

// Package record reads and writes sequences of records. Each record is a stream
// of bytes that completes before the next record starts.
//
// When reading, call Next to obtain an io.Reader for the next record. Next will
// return io.EOF when there are no more records. It is valid to call Next
// without reading the current record to exhaustion.
//
// When writing, call Next to obtain an io.Writer for the next record. Calling
// Next finishes the current record. Call Close to finish the final record.
//
// Optionally, call Flush to finish the current record and flush the underlying
// writer without starting a new record. To start a new record after flushing,
// call Next.
//
// Neither Readers nor Writers are safe to use concurrently.
//
// Example code:
//
//	func read(r io.Reader) ([]string, error) {
//		var ss []string
//		records := record.NewReader(r)
//		for {
//			rec, err := records.Next()
//			if err == io.EOF {
//				break
//			}
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				records.Recover()
//				continue
//			}
//			s, err := io.ReadAll(rec)
//			if err != nil {
//				log.Printf("recovering from %v", err)
//				records.Recover()
//				continue
//			}
//			ss = append(ss, string(s))
//		}
//		return ss, nil
//	}
//
//	func write(w io.Writer, ss []string) error {
//		records := record.NewWriter(w)
//		for _, s := range ss {
//			rec, err := records.Next()
//			if err != nil {
//				return err
//			}
//			if _, err := rec.Write([]byte(s)); err != nil {
//				return err
//			}
//		}
//		return records.Close()
//	}
//
// The wire format is that the stream is divided into 32 KiB blocks, and each
// block contains a number of tightly packed chunks. Chunks cannot cross block
// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a
// block must be zero.
//
// A record maps to one or more chunks. There are two chunk formats: legacy and
// recyclable. The legacy chunk format:
//
//	+----------+-----------+-----------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Payload   |
//	+----------+-----------+-----------+--- ... ---+
//
// CRC is computed over the type and payload
// Size is the length of the payload in bytes
// Type is the chunk type
//
// There are four chunk types: whether the chunk is the full record, or the
// first, middle or last chunk of a multi-chunk record. A multi-chunk record
// has one first chunk, zero or more middle chunks, and one last chunk.
//
// The recyclable chunk format is similar to the legacy format, but extends
// the chunk header with an additional log number field. This allows reuse
// (recycling) of log files which can provide significantly better performance
// when syncing frequently as it avoids needing to update the file
// metadata. Additionally, recycling log files is a prerequisite for using
// direct IO with log writing. The recyclable format is:
//
//	+----------+-----------+-----------+----------------+--- ... ---+
//	| CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
//	+----------+-----------+-----------+----------------+--- ... ---+
//
// Recyclable chunks are distinguished from legacy chunks by the addition of 4
// extra "recyclable" chunk types that map directly to the legacy chunk types
// (i.e. full, first, middle, last). The CRC is computed over the type, log
// number, and payload.
//
// The wire format allows for limited recovery in the face of data corruption:
// on a format error (such as a checksum mismatch), the reader moves to the
// next block and looks for the next full or first chunk.
package record

// The C++ LevelDB code calls this the log, but it has been renamed to record
// to avoid clashing with the standard log package, and because it is generally
// useful outside of logging. The C++ code also uses the term "physical record"
// instead of "chunk", but "chunk" is shorter and less confusing.

import (
	"encoding/binary"
	"io"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/crc"
)
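// The sketch below is illustrative only (not part of the original file): it
// assembles a legacy full chunk for a payload that fits in one chunk,
// mirroring the header layout in the package comment: CRC (4B), Size (2B),
// Type (1B), then the payload. The function name is hypothetical.
func exampleLegacyFullChunk(payload []byte) []byte {
	buf := make([]byte, legacyHeaderSize+len(payload))
	buf[6] = fullChunkType
	copy(buf[legacyHeaderSize:], payload)
	// The CRC covers the type byte and the payload; the CRC and size fields
	// themselves are excluded, exactly as in Writer.fillHeader below.
	binary.LittleEndian.PutUint32(buf[0:4], crc.New(buf[6:]).Value())
	binary.LittleEndian.PutUint16(buf[4:6], uint16(len(payload)))
	return buf
}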
// These constants are part of the wire format and should not be changed.
const (
	fullChunkType   = 1
	firstChunkType  = 2
	middleChunkType = 3
	lastChunkType   = 4

	recyclableFullChunkType   = 5
	recyclableFirstChunkType  = 6
	recyclableMiddleChunkType = 7
	recyclableLastChunkType   = 8
)

const (
	blockSize            = 32 * 1024
	blockSizeMask        = blockSize - 1
	legacyHeaderSize     = 7
	recyclableHeaderSize = legacyHeaderSize + 4
)

var (
	// ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker.
	ErrNotAnIOSeeker = errors.New("bitalostable/record: reader does not implement io.Seeker")

	// ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record.
	ErrNoLastRecord = errors.New("bitalostable/record: no last record exists")

	// ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This
	// usually occurs due to log file preallocation.
	ErrZeroedChunk = base.CorruptionErrorf("bitalostable/record: zeroed chunk")

	// ErrInvalidChunk is returned if a chunk is encountered with an invalid
	// header, length, or checksum. This usually occurs when a log is recycled,
	// but can also occur due to corruption.
	ErrInvalidChunk = base.CorruptionErrorf("bitalostable/record: invalid chunk")
)

// IsInvalidRecord returns true if the error matches one of the error types
// returned for invalid records. These are treated in a way similar to io.EOF
// in recovery code.
func IsInvalidRecord(err error) bool {
	return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF
}
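// IsInvalidRecord supports LevelDB-style salvage loops. The helper below is a
// hypothetical in-package sketch (recover is unexported here) showing the
// intended pattern: keep every intact record, skip past corruption, and only
// surface hard I/O errors.
func readAllValid(r *Reader) ([][]byte, error) {
	var recs [][]byte
	for {
		rec, err := r.Next()
		if err == io.EOF {
			return recs, nil
		}
		if err != nil {
			if !IsInvalidRecord(err) {
				return recs, err
			}
			r.recover() // resume at the next good 32 KiB block
			continue
		}
		b, err := io.ReadAll(rec)
		if err != nil {
			if !IsInvalidRecord(err) {
				return recs, err
			}
			r.recover()
			continue
		}
		recs = append(recs, b)
	}
}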
// Reader reads records from an underlying io.Reader.
type Reader struct {
	// r is the underlying reader.
	r io.Reader
	// logNum is the low 32-bits of the log's file number. May be zero when used
	// with log files that do not have a file number (e.g. the MANIFEST).
	logNum uint32
	// blockNum is the zero based block number currently held in buf.
	blockNum int64
	// seq is the sequence number of the current record.
	seq int
	// buf[begin:end] is the unread portion of the current chunk's payload. The
	// low bound, begin, excludes the chunk header.
	begin, end int
	// n is the number of bytes of buf that are valid. Once reading has started,
	// only the final block can have n < blockSize.
	n int
	// recovering is true when recovering from corruption.
	recovering bool
	// last is whether the current chunk is the last chunk of the record.
	last bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}

// NewReader returns a new reader. If the file contains records encoded using
// the recyclable record format, then the log number in those records must
// match the specified logNum.
func NewReader(r io.Reader, logNum base.FileNum) *Reader {
	return &Reader{
		r:        r,
		logNum:   uint32(logNum),
		blockNum: -1,
	}
}
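// Hypothetical sketch (not in the original file): how a chunk header is
// decoded at offset off within a block, matching what nextChunk does below.
// Recyclable chunk types 5-8 fold onto legacy types 1-4; the extra 4-byte log
// number sits between the type byte and the payload.
func decodeChunkHeader(block []byte, off int) (checksum uint32, length uint16, chunkType byte, headerSize int) {
	checksum = binary.LittleEndian.Uint32(block[off : off+4])
	length = binary.LittleEndian.Uint16(block[off+4 : off+6])
	chunkType = block[off+6]
	headerSize = legacyHeaderSize
	if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
		headerSize = recyclableHeaderSize
		chunkType -= recyclableFullChunkType - 1
	}
	return checksum, length, chunkType, headerSize
}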
// nextChunk sets r.buf[r.begin:r.end] to hold the next chunk's payload,
// reading the next block into the buffer if necessary.
func (r *Reader) nextChunk(wantFirst bool) error {
	for {
		if r.end+legacyHeaderSize <= r.n {
			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
			chunkType := r.buf[r.end+6]

			if checksum == 0 && length == 0 && chunkType == 0 {
				if r.end+recyclableHeaderSize > r.n {
					// Skip the rest of the block if the recyclable header size does not
					// fit within it.
					r.end = r.n
					continue
				}
				if r.recovering {
					// Skip the rest of the block, if it looks like it is all
					// zeroes. This is common with WAL preallocation.
					//
					// Set r.err to be an error so r.recover actually recovers.
					r.err = ErrZeroedChunk
					r.recover()
					continue
				}
				return ErrZeroedChunk
			}

			headerSize := legacyHeaderSize
			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
				headerSize = recyclableHeaderSize
				if r.end+headerSize > r.n {
					return ErrInvalidChunk
				}

				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
				if logNum != r.logNum {
					if wantFirst {
						// If we're looking for the first chunk of a record, we can treat a
						// previous instance of the log as EOF.
						return io.EOF
					}
					// Otherwise, treat this chunk as invalid in order to prevent reading
					// of a partial record.
					return ErrInvalidChunk
				}

				chunkType -= (recyclableFullChunkType - 1)
			}

			r.begin = r.end + headerSize
			r.end = r.begin + int(length)
			if r.end > r.n {
				if r.recovering {
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
				if r.recovering {
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if wantFirst {
				if chunkType != fullChunkType && chunkType != firstChunkType {
					continue
				}
			}
			r.last = chunkType == fullChunkType || chunkType == lastChunkType
			r.recovering = false
			return nil
		}
		if r.n < blockSize && r.blockNum >= 0 {
			if !wantFirst || r.end != r.n {
				// This can happen if the previous instance of the log ended with a
				// partial block at the same blockNum as the new log but extended
				// beyond the partial block of the new log.
				return ErrInvalidChunk
			}
			return io.EOF
		}
		n, err := io.ReadFull(r.r, r.buf[:])
		if err != nil && err != io.ErrUnexpectedEOF {
			if err == io.EOF && !wantFirst {
				return io.ErrUnexpectedEOF
			}
			return err
		}
		r.begin, r.end, r.n = 0, 0, n
		r.blockNum++
	}
}

// Next returns a reader for the next record. It returns io.EOF if there are no
// more records. The reader returned becomes stale after the next Next call,
// and should no longer be used.
func (r *Reader) Next() (io.Reader, error) {
	r.seq++
	if r.err != nil {
		return nil, r.err
	}
	r.begin = r.end
	r.err = r.nextChunk(true)
	if r.err != nil {
		return nil, r.err
	}
	return singleReader{r, r.seq}, nil
}

// Offset returns the current offset within the file. If called immediately
// before a call to Next(), Offset() will return the record offset.
func (r *Reader) Offset() int64 {
	if r.blockNum < 0 {
		return 0
	}
	return int64(r.blockNum)*blockSize + int64(r.end)
}

// recover clears any errors read so far, so that calling Next will start
// reading from the next good 32 KiB block. If there are no such blocks, Next
// will return io.EOF. recover also marks the current reader, the one most
// recently returned by Next, as stale. If recover is called without any
// prior error, then recover is a no-op.
func (r *Reader) recover() {
	if r.err == nil {
		return
	}
	r.recovering = true
	r.err = nil
	// Discard the rest of the current block.
	r.begin, r.end, r.last = r.n, r.n, false
	// Invalidate any outstanding singleReader.
	r.seq++
}

// seekRecord seeks in the underlying io.Reader such that calling r.Next
// returns the record whose first chunk header starts at the provided offset.
// Its behavior is undefined if the argument given is not such an offset, as
// the bytes at that offset may coincidentally appear to be a valid header.
//
// It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement
// io.Seeker.
//
// seekRecord will fail and return an error if the Reader previously
// encountered an error, including io.EOF. Such errors can be cleared by
// calling recover. Calling seekRecord after recover will make calling Next
// return the record at the given offset, instead of the record at the next
// good 32 KiB block as recover normally would. Calling seekRecord before
// recover has no effect on recover's semantics other than changing the
// starting point for determining the next good 32 KiB block.
//
// The offset is always relative to the start of the underlying io.Reader, so
// negative values will result in an error as per io.Seeker.
func (r *Reader) seekRecord(offset int64) error {
	r.seq++
	if r.err != nil {
		return r.err
	}

	s, ok := r.r.(io.Seeker)
	if !ok {
		return ErrNotAnIOSeeker
	}

	// Only seek to an exact block offset.
	c := int(offset & blockSizeMask)
	if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil {
		return r.err
	}

	// Clear the state of the internal reader.
	r.begin, r.end, r.n = 0, 0, 0
	r.blockNum, r.recovering, r.last = -1, false, false
	if r.err = r.nextChunk(false); r.err != nil {
		return r.err
	}

	// Now skip to the offset requested within the block. A subsequent
	// call to Next will return the record at the requested offset.
	r.begin, r.end = c, c

	return nil
}
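// A sketch of how Offset and seekRecord pair up (hypothetical, in-package
// since seekRecord is unexported): per the Offset contract, the value taken
// immediately before Next is the offset of that record's first chunk header,
// which is exactly what seekRecord expects.
func rereadLast(r *Reader) ([]byte, error) {
	off := r.Offset() // offset of the upcoming record's first chunk header
	if _, err := r.Next(); err != nil {
		return nil, err
	}
	if err := r.seekRecord(off); err != nil {
		return nil, err
	}
	rec, err := r.Next()
	if err != nil {
		return nil, err
	}
	return io.ReadAll(rec)
}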
type singleReader struct {
	r   *Reader
	seq int
}

func (x singleReader) Read(p []byte) (int, error) {
	r := x.r
	if r.seq != x.seq {
		return 0, errors.New("bitalostable/record: stale reader")
	}
	if r.err != nil {
		return 0, r.err
	}
	for r.begin == r.end {
		if r.last {
			return 0, io.EOF
		}
		if r.err = r.nextChunk(false); r.err != nil {
			return 0, r.err
		}
	}
	n := copy(p, r.buf[r.begin:r.end])
	r.begin += n
	return n, nil
}

// Writer writes records to an underlying io.Writer.
type Writer struct {
	// w is the underlying writer.
	w io.Writer
	// seq is the sequence number of the current record.
	seq int
	// f is w as a flusher.
	f flusher
	// buf[i:j] is the bytes that will become the current chunk.
	// The low bound, i, includes the chunk header.
	i, j int
	// buf[:written] has already been written to w.
	// written is zero unless Flush has been called.
	written int
	// baseOffset is the base offset in w at which writing started. If
	// w implements io.Seeker, it's relative to the start of w, 0 otherwise.
	baseOffset int64
	// blockNumber is the zero based block number currently held in buf.
	blockNumber int64
	// lastRecordOffset is the offset in w where the last record was
	// written (including the chunk header). It is a relative offset to
	// baseOffset, thus the absolute offset of the last record is
	// baseOffset + lastRecordOffset.
	lastRecordOffset int64
	// first is whether the current chunk is the first chunk of the record.
	first bool
	// pending is whether a chunk is buffered but not yet written.
	pending bool
	// err is any accumulated error.
	err error
	// buf is the buffer.
	buf [blockSize]byte
}

// NewWriter returns a new Writer.
func NewWriter(w io.Writer) *Writer {
	f, _ := w.(flusher)

	var o int64
	if s, ok := w.(io.Seeker); ok {
		var err error
		if o, err = s.Seek(0, io.SeekCurrent); err != nil {
			o = 0
		}
	}
	return &Writer{
		w:                w,
		f:                f,
		baseOffset:       o,
		lastRecordOffset: -1,
	}
}

// fillHeader fills in the header for the pending chunk.
func (w *Writer) fillHeader(last bool) {
	if w.i+legacyHeaderSize > w.j || w.j > blockSize {
		panic("bitalostable/record: bad writer state")
	}
	if last {
		if w.first {
			w.buf[w.i+6] = fullChunkType
		} else {
			w.buf[w.i+6] = lastChunkType
		}
	} else {
		if w.first {
			w.buf[w.i+6] = firstChunkType
		} else {
			w.buf[w.i+6] = middleChunkType
		}
	}
	binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value())
	binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize))
}

// writeBlock writes the buffered block to the underlying writer, and reserves
// space for the next chunk's header.
func (w *Writer) writeBlock() {
	_, w.err = w.w.Write(w.buf[w.written:])
	w.i = 0
	w.j = legacyHeaderSize
	w.written = 0
	w.blockNumber++
}
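// The chunk type written by fillHeader depends only on whether the chunk is
// the first and/or last chunk of its record. A hypothetical truth-table
// helper, for illustration:
func chunkTypeFor(first, last bool) byte {
	switch {
	case first && last:
		return fullChunkType // record fits in a single chunk
	case first:
		return firstChunkType
	case last:
		return lastChunkType
	default:
		return middleChunkType
	}
}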
// writePending finishes the current record and writes the buffer to the
// underlying writer.
func (w *Writer) writePending() {
	if w.err != nil {
		return
	}
	if w.pending {
		w.fillHeader(true)
		w.pending = false
	}
	_, w.err = w.w.Write(w.buf[w.written:w.j])
	w.written = w.j
}

// Close finishes the current record and closes the writer.
func (w *Writer) Close() error {
	w.seq++
	w.writePending()
	if w.err != nil {
		return w.err
	}
	w.err = errors.New("bitalostable/record: closed Writer")
	return nil
}

// Flush finishes the current record, writes to the underlying writer, and
// flushes it if that writer implements interface{ Flush() error }.
func (w *Writer) Flush() error {
	w.seq++
	w.writePending()
	if w.err != nil {
		return w.err
	}
	if w.f != nil {
		w.err = w.f.Flush()
		return w.err
	}
	return nil
}

// Next returns a writer for the next record. The writer returned becomes stale
// after the next Close, Flush or Next call, and should no longer be used.
func (w *Writer) Next() (io.Writer, error) {
	w.seq++
	if w.err != nil {
		return nil, w.err
	}
	if w.pending {
		w.fillHeader(true)
	}
	w.i = w.j
	w.j = w.j + legacyHeaderSize
	// Check if there is room in the block for the header.
	if w.j > blockSize {
		// Fill in the rest of the block with zeroes.
		for k := w.i; k < blockSize; k++ {
			w.buf[k] = 0
		}
		w.writeBlock()
		if w.err != nil {
			return nil, w.err
		}
	}
	w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i)
	w.first = true
	w.pending = true
	return singleWriter{w, w.seq}, nil
}

// WriteRecord writes a complete record. Returns the offset just past the end
// of the record.
func (w *Writer) WriteRecord(p []byte) (int64, error) {
	if w.err != nil {
		return -1, w.err
	}
	t, err := w.Next()
	if err != nil {
		return -1, err
	}
	if _, err := t.Write(p); err != nil {
		return -1, err
	}
	w.writePending()
	offset := w.blockNumber*blockSize + int64(w.j)
	return offset, w.err
}

// Size returns the current size of the file.
func (w *Writer) Size() int64 {
	if w == nil {
		return 0
	}
	return w.blockNumber*blockSize + int64(w.j)
}
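// Hypothetical batching sketch (not part of the original API): append several
// records via WriteRecord, then Flush once so the underlying writer is pushed
// a single time. Each returned offset is just past the end of its record.
func appendAll(w *Writer, recs [][]byte) ([]int64, error) {
	offs := make([]int64, 0, len(recs))
	for _, rec := range recs {
		off, err := w.WriteRecord(rec)
		if err != nil {
			return nil, err
		}
		offs = append(offs, off)
	}
	return offs, w.Flush()
}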
// LastRecordOffset returns the offset in the underlying io.Writer of the last
// record so far - the one created by the most recent Next call. It is the
// offset of the first chunk header, suitable to pass to Reader.seekRecord.
//
// If that io.Writer also implements io.Seeker, the return value is an absolute
// offset, in the sense of io.SeekStart, regardless of whether the io.Writer
// was initially at the zero position when passed to NewWriter. Otherwise, the
// return value is a relative offset: the number of bytes written between the
// NewWriter call and the start of the last record.
//
// If there is no last record, i.e. nothing was written, LastRecordOffset will
// return ErrNoLastRecord.
func (w *Writer) LastRecordOffset() (int64, error) {
	if w.err != nil {
		return 0, w.err
	}
	if w.lastRecordOffset < 0 {
		return 0, ErrNoLastRecord
	}
	return w.lastRecordOffset, nil
}

type singleWriter struct {
	w   *Writer
	seq int
}

func (x singleWriter) Write(p []byte) (int, error) {
	w := x.w
	if w.seq != x.seq {
		return 0, errors.New("bitalostable/record: stale writer")
	}
	if w.err != nil {
		return 0, w.err
	}
	n0 := len(p)
	for len(p) > 0 {
		// Write a block, if it is full.
		if w.j == blockSize {
			w.fillHeader(false)
			w.writeBlock()
			if w.err != nil {
				return 0, w.err
			}
			w.first = false
		}
		// Copy bytes into the buffer.
		n := copy(w.buf[w.j:], p)
		w.j += n
		p = p[n:]
	}
	return n0, nil
}
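// End-to-end sketch (hypothetical; assumes "bytes" is added to the import
// block): write two records through Writer and read them back with Reader,
// demonstrating that the chunk framing round-trips payloads byte-for-byte.
func roundTripExample() error {
	var buf bytes.Buffer
	w := NewWriter(&buf)
	for _, s := range []string{"hello", "world"} {
		rec, err := w.Next()
		if err != nil {
			return err
		}
		if _, err := rec.Write([]byte(s)); err != nil {
			return err
		}
	}
	if err := w.Close(); err != nil {
		return err
	}
	r := NewReader(&buf, 0 /* logNum; only recyclable chunks check it */)
	for {
		rec, err := r.Next()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		if _, err := io.ReadAll(rec); err != nil {
			return err
		}
	}
}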