github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/record/record.go (about) 1 // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package record 16 17 import ( 18 "encoding/binary" 19 "io" 20 21 "github.com/zuoyebang/bitalosdb/internal/base" 22 "github.com/zuoyebang/bitalosdb/internal/crc" 23 24 "github.com/cockroachdb/errors" 25 ) 26 27 const ( 28 fullChunkType = 1 29 firstChunkType = 2 30 middleChunkType = 3 31 lastChunkType = 4 32 33 recyclableFullChunkType = 5 34 recyclableFirstChunkType = 6 35 recyclableMiddleChunkType = 7 36 recyclableLastChunkType = 8 37 ) 38 39 const ( 40 blockSize = 32 * 1024 41 blockSizeMask = blockSize - 1 42 legacyHeaderSize = 7 43 recyclableHeaderSize = legacyHeaderSize + 4 44 ) 45 46 var ( 47 // ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker. 48 ErrNotAnIOSeeker = errors.New("bitalosdb/record: reader does not implement io.Seeker") 49 50 // ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record. 51 ErrNoLastRecord = errors.New("bitalosdb/record: no last record exists") 52 53 // ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This 54 // usually occurs due to log file preallocation. 
55 ErrZeroedChunk = base.CorruptionErrorf("bitalosdb/record: zeroed chunk") 56 57 // ErrInvalidChunk is returned if a chunk is encountered with an invalid 58 // header, length, or checksum. This usually occurs when a log is recycled, 59 // but can also occur due to corruption. 60 ErrInvalidChunk = base.CorruptionErrorf("bitalosdb/record: invalid chunk") 61 ) 62 63 // IsInvalidRecord returns true if the error matches one of the error types 64 // returned for invalid records. These are treated in a way similar to io.EOF 65 // in recovery code. 66 func IsInvalidRecord(err error) bool { 67 return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF 68 } 69 70 // Reader reads records from an underlying io.Reader. 71 type Reader struct { 72 // r is the underlying reader. 73 r io.Reader 74 // logNum is the low 32-bits of the log's file number. May be zero when used 75 // with log files that do not have a file number (e.g. the MANIFEST). 76 logNum uint32 77 // blockNum is the zero based block number currently held in buf. 78 blockNum int64 79 // seq is the sequence number of the current record. 80 seq int 81 // buf[begin:end] is the unread portion of the current chunk's payload. The 82 // low bound, begin, excludes the chunk header. 83 begin, end int 84 // n is the number of bytes of buf that are valid. Once reading has started, 85 // only the final block can have n < blockSize. 86 n int 87 // recovering is true when recovering from corruption. 88 recovering bool 89 // last is whether the current chunk is the last chunk of the record. 90 last bool 91 // err is any accumulated error. 92 err error 93 // buf is the buffer. 94 buf [blockSize]byte 95 } 96 97 // NewReader returns a new reader. If the file contains records encoded using 98 // the recyclable record format, then the log number in those records must 99 // match the specified logNum. 
func NewReader(r io.Reader, logNum base.FileNum) *Reader {
	return &Reader{
		r: r,
		// Only the low 32 bits of the file number are kept; that is the width
		// of the log number field in a recyclable chunk header.
		logNum: uint32(logNum),
		// A negative block number marks that no block has been read yet.
		blockNum: -1,
	}
}

// nextChunk sets r.buf[r.begin:r.end] to hold the next chunk's payload,
// reading the next block into the buffer if necessary.
//
// wantFirst is true when the caller is looking for the start of a record:
// chunks that are not a full or first chunk are skipped, and running out of
// data (or meeting a chunk from a different log number) yields io.EOF. When
// wantFirst is false the caller is in the middle of a record, so the same
// conditions yield io.ErrUnexpectedEOF or ErrInvalidChunk instead.
//
// On success r.last records whether the chunk is the final one of its record
// and r.recovering is cleared.
func (r *Reader) nextChunk(wantFirst bool) error {
	for {
		// Parse the next header only if a whole legacy header fits in the
		// valid bytes after the previous chunk.
		if r.end+legacyHeaderSize <= r.n {
			// Header layout: little-endian uint32 checksum, little-endian
			// uint16 payload length, one chunk type byte.
			checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4])
			length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6])
			chunkType := r.buf[r.end+6]

			if checksum == 0 && length == 0 && chunkType == 0 {
				if r.end+recyclableHeaderSize > r.n {
					// Skip the rest of the block if the recyclable header size does not
					// fit within it.
					r.end = r.n
					continue
				}
				if r.recovering {
					// Skip the rest of the block, if it looks like it is all
					// zeroes. This is common with WAL preallocation.
					//
					// Set r.err to be an error so r.recover actually recovers.
					r.err = ErrZeroedChunk
					r.recover()
					continue
				}
				return ErrZeroedChunk
			}

			headerSize := legacyHeaderSize
			if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType {
				headerSize = recyclableHeaderSize
				if r.end+headerSize > r.n {
					return ErrInvalidChunk
				}

				// The recyclable header carries the log number in the four
				// bytes after the type byte.
				logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11])
				if logNum != r.logNum {
					if wantFirst {
						// If we're looking for the first chunk of a record, we can treat a
						// previous instance of the log as EOF.
						return io.EOF
					}
					// Otherwise, treat this chunk as invalid in order to prevent reading
					// of a partial record.
					return ErrInvalidChunk
				}

				// Map the recyclable type onto its legacy equivalent so the
				// checks below only deal with the four legacy values.
				chunkType -= recyclableFullChunkType - 1
			}

			r.begin = r.end + headerSize
			r.end = r.begin + int(length)
			// The declared payload must not run past the valid bytes.
			if r.end > r.n {
				if r.recovering {
					// r.recover returns immediately when r.err is nil, so
					// this only discards the block if an error is pending.
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			// The checksum covers everything from the type byte to the end
			// of the payload (including the log number, when present).
			if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
				if r.recovering {
					r.recover()
					continue
				}
				return ErrInvalidChunk
			}
			if wantFirst {
				// Skip middle and last chunks: they belong to a record whose
				// beginning was not consumed.
				if chunkType != fullChunkType && chunkType != firstChunkType {
					continue
				}
			}
			r.last = chunkType == fullChunkType || chunkType == lastChunkType
			r.recovering = false
			return nil
		}
		// A short block that has already been read is the final block, so
		// there is nothing further to load.
		if r.n < blockSize && r.blockNum >= 0 {
			if !wantFirst || r.end != r.n {
				// This can happen if the previous instance of the log ended with a
				// partial block at the same blockNum as the new log but extended
				// beyond the partial block of the new log.
				return ErrInvalidChunk
			}
			return io.EOF
		}
		// Load the next block. io.ErrUnexpectedEOF just means the block is
		// short, which is accepted; n then records how many bytes are valid.
		n, err := io.ReadFull(r.r, r.buf[:])
		if err != nil && err != io.ErrUnexpectedEOF {
			// Hitting a clean EOF in the middle of a record means the record
			// was truncated.
			if err == io.EOF && !wantFirst {
				return io.ErrUnexpectedEOF
			}
			return err
		}
		r.begin, r.end, r.n = 0, 0, n
		r.blockNum++
	}
}

// Next returns a reader for the next record. It returns io.EOF if there are no
// more records. The reader returned becomes stale after the next Next call,
// and should no longer be used.
//
// Any error is remembered in r.err and returned again by subsequent calls.
func (r *Reader) Next() (io.Reader, error) {
	// Bumping seq invalidates any previously returned singleReader.
	r.seq++
	if r.err != nil {
		return nil, r.err
	}
	// Drop whatever is left unread of the current chunk.
	r.begin = r.end
	r.err = r.nextChunk(true)
	if r.err != nil {
		return nil, r.err
	}
	return singleReader{r, r.seq}, nil
}

// Offset returns the current offset within the file: the start of the block
// held in the buffer plus the end of the current chunk within that block. If
// called immediately before a call to Next(), Offset() will return the record
// offset. It returns 0 if no block has been read yet.
222 func (r *Reader) Offset() int64 { 223 if r.blockNum < 0 { 224 return 0 225 } 226 return int64(r.blockNum)*blockSize + int64(r.end) 227 } 228 229 // recover clears any errors read so far, so that calling Next will start 230 // reading from the next good 32KiB block. If there are no such blocks, Next 231 // will return io.EOF. recover also marks the current reader, the one most 232 // recently returned by Next, as stale. If recover is called without any 233 // prior error, then recover is a no-op. 234 func (r *Reader) recover() { 235 if r.err == nil { 236 return 237 } 238 r.recovering = true 239 r.err = nil 240 // Discard the rest of the current block. 241 r.begin, r.end, r.last = r.n, r.n, false 242 // Invalidate any outstanding singleReader. 243 r.seq++ 244 } 245 246 // seekRecord seeks in the underlying io.Reader such that calling r.Next 247 // returns the record whose first chunk header starts at the provided offset. 248 // Its behavior is undefined if the argument given is not such an offset, as 249 // the bytes at that offset may coincidentally appear to be a valid header. 250 // 251 // It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement 252 // io.Seeker. 253 // 254 // seekRecord will fail and return an error if the Reader previously 255 // encountered an error, including io.EOF. Such errors can be cleared by 256 // calling Recover. Calling seekRecord after Recover will make calling Next 257 // return the record at the given offset, instead of the record at the next 258 // good 32KiB block as Recover normally would. Calling seekRecord before 259 // Recover has no effect on Recover's semantics other than changing the 260 // starting point for determining the next good 32KiB block. 261 // 262 // The offset is always relative to the start of the underlying io.Reader, so 263 // negative values will result in an error as per io.Seeker. 
264 func (r *Reader) seekRecord(offset int64) error { 265 r.seq++ 266 if r.err != nil { 267 return r.err 268 } 269 270 s, ok := r.r.(io.Seeker) 271 if !ok { 272 return ErrNotAnIOSeeker 273 } 274 275 // Only seek to an exact block offset. 276 c := int(offset & blockSizeMask) 277 if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil { 278 return r.err 279 } 280 281 // Clear the state of the internal reader. 282 r.begin, r.end, r.n = 0, 0, 0 283 r.blockNum, r.recovering, r.last = -1, false, false 284 if r.err = r.nextChunk(false); r.err != nil { 285 return r.err 286 } 287 288 // Now skip to the offset requested within the block. A subsequent 289 // call to Next will return the block at the requested offset. 290 r.begin, r.end = c, c 291 292 return nil 293 } 294 295 type singleReader struct { 296 r *Reader 297 seq int 298 } 299 300 func (x singleReader) Read(p []byte) (int, error) { 301 r := x.r 302 if r.seq != x.seq { 303 return 0, errors.New("bitalosdb/record: stale reader") 304 } 305 if r.err != nil { 306 return 0, r.err 307 } 308 for r.begin == r.end { 309 if r.last { 310 return 0, io.EOF 311 } 312 if r.err = r.nextChunk(false); r.err != nil { 313 return 0, r.err 314 } 315 } 316 n := copy(p, r.buf[r.begin:r.end]) 317 r.begin += n 318 return n, nil 319 } 320 321 // Writer writes records to an underlying io.Writer. 322 type Writer struct { 323 // w is the underlying writer. 324 w io.Writer 325 // seq is the sequence number of the current record. 326 seq int 327 // f is w as a flusher. 328 f flusher 329 // buf[i:j] is the bytes that will become the current chunk. 330 // The low bound, i, includes the chunk header. 331 i, j int 332 // buf[:written] has already been written to w. 333 // written is zero unless Flush has been called. 334 written int 335 // baseOffset is the base offset in w at which writing started. If 336 // w implements io.Seeker, it's relative to the start of w, 0 otherwise. 
337 baseOffset int64 338 // blockNumber is the zero based block number currently held in buf. 339 blockNumber int64 340 // lastRecordOffset is the offset in w where the last record was 341 // written (including the chunk header). It is a relative offset to 342 // baseOffset, thus the absolute offset of the last record is 343 // baseOffset + lastRecordOffset. 344 lastRecordOffset int64 345 // first is whether the current chunk is the first chunk of the record. 346 first bool 347 // pending is whether a chunk is buffered but not yet written. 348 pending bool 349 // err is any accumulated error. 350 err error 351 // buf is the buffer. 352 buf [blockSize]byte 353 } 354 355 // NewWriter returns a new Writer. 356 func NewWriter(w io.Writer) *Writer { 357 f, _ := w.(flusher) 358 359 var o int64 360 if s, ok := w.(io.Seeker); ok { 361 var err error 362 if o, err = s.Seek(0, io.SeekCurrent); err != nil { 363 o = 0 364 } 365 } 366 return &Writer{ 367 w: w, 368 f: f, 369 baseOffset: o, 370 lastRecordOffset: -1, 371 } 372 } 373 374 // fillHeader fills in the header for the pending chunk. 375 func (w *Writer) fillHeader(last bool) { 376 if w.i+legacyHeaderSize > w.j || w.j > blockSize { 377 panic("bitalosdb/record: bad writer state") 378 } 379 if last { 380 if w.first { 381 w.buf[w.i+6] = fullChunkType 382 } else { 383 w.buf[w.i+6] = lastChunkType 384 } 385 } else { 386 if w.first { 387 w.buf[w.i+6] = firstChunkType 388 } else { 389 w.buf[w.i+6] = middleChunkType 390 } 391 } 392 binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value()) 393 binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize)) 394 } 395 396 // writeBlock writes the buffered block to the underlying writer, and reserves 397 // space for the next chunk's header. 
398 func (w *Writer) writeBlock() { 399 _, w.err = w.w.Write(w.buf[w.written:]) 400 w.i = 0 401 w.j = legacyHeaderSize 402 w.written = 0 403 w.blockNumber++ 404 } 405 406 // writePending finishes the current record and writes the buffer to the 407 // underlying writer. 408 func (w *Writer) writePending() { 409 if w.err != nil { 410 return 411 } 412 if w.pending { 413 w.fillHeader(true) 414 w.pending = false 415 } 416 _, w.err = w.w.Write(w.buf[w.written:w.j]) 417 w.written = w.j 418 } 419 420 // Close finishes the current record and closes the writer. 421 func (w *Writer) Close() error { 422 w.seq++ 423 w.writePending() 424 if w.err != nil { 425 return w.err 426 } 427 w.err = errors.New("bitalosdb/record: closed Writer") 428 return nil 429 } 430 431 // Flush finishes the current record, writes to the underlying writer, and 432 // flushes it if that writer implements interface{ Flush() error }. 433 func (w *Writer) Flush() error { 434 w.seq++ 435 w.writePending() 436 if w.err != nil { 437 return w.err 438 } 439 if w.f != nil { 440 w.err = w.f.Flush() 441 return w.err 442 } 443 return nil 444 } 445 446 // Next returns a writer for the next record. The writer returned becomes stale 447 // after the next Close, Flush or Next call, and should no longer be used. 448 func (w *Writer) Next() (io.Writer, error) { 449 w.seq++ 450 if w.err != nil { 451 return nil, w.err 452 } 453 if w.pending { 454 w.fillHeader(true) 455 } 456 w.i = w.j 457 w.j = w.j + legacyHeaderSize 458 // Check if there is room in the block for the header. 459 if w.j > blockSize { 460 // Fill in the rest of the block with zeroes. 461 for k := w.i; k < blockSize; k++ { 462 w.buf[k] = 0 463 } 464 w.writeBlock() 465 if w.err != nil { 466 return nil, w.err 467 } 468 } 469 w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i) 470 w.first = true 471 w.pending = true 472 return singleWriter{w, w.seq}, nil 473 } 474 475 // WriteRecord writes a complete record. 
Returns the offset just past the end 476 // of the record. 477 func (w *Writer) WriteRecord(p []byte) (int64, error) { 478 if w.err != nil { 479 return -1, w.err 480 } 481 t, err := w.Next() 482 if err != nil { 483 return -1, err 484 } 485 if _, err := t.Write(p); err != nil { 486 return -1, err 487 } 488 w.writePending() 489 offset := w.blockNumber*blockSize + int64(w.j) 490 return offset, w.err 491 } 492 493 // Size returns the current size of the file. 494 func (w *Writer) Size() int64 { 495 if w == nil { 496 return 0 497 } 498 return w.blockNumber*blockSize + int64(w.j) 499 } 500 501 // LastRecordOffset returns the offset in the underlying io.Writer of the last 502 // record so far - the one created by the most recent Next call. It is the 503 // offset of the first chunk header, suitable to pass to Reader.SeekRecord. 504 // 505 // If that io.Writer also implements io.Seeker, the return value is an absolute 506 // offset, in the sense of io.SeekStart, regardless of whether the io.Writer 507 // was initially at the zero position when passed to NewWriter. Otherwise, the 508 // return value is a relative offset, being the number of bytes written between 509 // the NewWriter call and any records written prior to the last record. 510 // 511 // If there is no last record, i.e. nothing was written, LastRecordOffset will 512 // return ErrNoLastRecord. 513 func (w *Writer) LastRecordOffset() (int64, error) { 514 if w.err != nil { 515 return 0, w.err 516 } 517 if w.lastRecordOffset < 0 { 518 return 0, ErrNoLastRecord 519 } 520 return w.lastRecordOffset, nil 521 } 522 523 type singleWriter struct { 524 w *Writer 525 seq int 526 } 527 528 func (x singleWriter) Write(p []byte) (int, error) { 529 w := x.w 530 if w.seq != x.seq { 531 return 0, errors.New("bitalosdb/record: stale writer") 532 } 533 if w.err != nil { 534 return 0, w.err 535 } 536 n0 := len(p) 537 for len(p) > 0 { 538 // Write a block, if it is full. 
539 if w.j == blockSize { 540 w.fillHeader(false) 541 w.writeBlock() 542 if w.err != nil { 543 return 0, w.err 544 } 545 w.first = false 546 } 547 // Copy bytes into the buffer. 548 n := copy(w.buf[w.j:], p) 549 w.j += n 550 p = p[n:] 551 } 552 return n0, nil 553 }