github.com/ethereum/go-ethereum@v1.16.1/core/rawdb/freezer_table.go (about) 1 // Copyright 2019 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package rawdb 18 19 import ( 20 "bufio" 21 "bytes" 22 "encoding/binary" 23 "errors" 24 "fmt" 25 "io" 26 "os" 27 "path/filepath" 28 "sync" 29 "sync/atomic" 30 "time" 31 32 "github.com/ethereum/go-ethereum/common" 33 "github.com/ethereum/go-ethereum/log" 34 "github.com/ethereum/go-ethereum/metrics" 35 "github.com/golang/snappy" 36 ) 37 38 var ( 39 // errClosed is returned if an operation attempts to read from or write to the 40 // freezer table after it has already been closed. 41 errClosed = errors.New("closed") 42 43 // errOutOfBounds is returned if the item requested is not contained within the 44 // freezer table. 45 errOutOfBounds = errors.New("out of bounds") 46 47 // errNotSupported is returned if the database doesn't support the required operation. 48 errNotSupported = errors.New("this operation is not supported") 49 ) 50 51 // indexEntry contains the number/id of the file that the data resides in, as well as the 52 // offset within the file to the end of the data. 53 // In serialized form, the filenum is stored as uint16. 54 type indexEntry struct { 55 filenum uint32 // stored as uint16 ( 2 bytes ) 56 offset uint32 // stored as uint32 ( 4 bytes ) 57 } 58 59 const indexEntrySize = 6 60 61 // unmarshalBinary deserializes binary b into the rawIndex entry. 62 func (i *indexEntry) unmarshalBinary(b []byte) { 63 i.filenum = uint32(binary.BigEndian.Uint16(b[:2])) 64 i.offset = binary.BigEndian.Uint32(b[2:6]) 65 } 66 67 // append adds the encoded entry to the end of b. 68 func (i *indexEntry) append(b []byte) []byte { 69 offset := len(b) 70 out := append(b, make([]byte, indexEntrySize)...) 71 binary.BigEndian.PutUint16(out[offset:], uint16(i.filenum)) 72 binary.BigEndian.PutUint32(out[offset+2:], i.offset) 73 return out 74 } 75 76 // bounds returns the start- and end- offsets, and the file number of where to 77 // read there data item marked by the two index entries. The two entries are 78 // assumed to be sequential. 79 func (i *indexEntry) bounds(end *indexEntry) (startOffset, endOffset, fileId uint32) { 80 if i.filenum != end.filenum { 81 // If a piece of data 'crosses' a data-file, 82 // it's actually in one piece on the second data-file. 83 // We return a zero-indexEntry for the second file as start 84 return 0, end.offset, end.filenum 85 } 86 return i.offset, end.offset, end.filenum 87 } 88 89 // freezerTable represents a single chained data table within the freezer (e.g. blocks). 90 // It consists of a data file (snappy encoded arbitrary data blobs) and an indexEntry 91 // file (uncompressed 64 bit indices into the data file). 92 type freezerTable struct { 93 items atomic.Uint64 // Number of items stored in the table (including items removed from tail) 94 itemOffset atomic.Uint64 // Number of items removed from the table 95 96 // itemHidden is the number of items marked as deleted. Tail deletion is 97 // only supported at file level which means the actual deletion will be 98 // delayed until the entire data file is marked as deleted. Before that 99 // these items will be hidden to prevent being visited again. The value 100 // should never be lower than itemOffset. 101 itemHidden atomic.Uint64 102 103 config freezerTableConfig // if true, disables snappy compression. Note: does not work retroactively 104 readonly bool 105 maxFileSize uint32 // Max file size for data-files 106 name string 107 path string 108 109 head *os.File // File descriptor for the data head of the table 110 index *os.File // File descriptor for the indexEntry file of the table 111 files map[uint32]*os.File // open files 112 headId uint32 // number of the currently active head file 113 tailId uint32 // number of the earliest file 114 115 metadata *freezerTableMeta // metadata of the table 116 lastSync time.Time // Timestamp when the last sync was performed 117 118 headBytes int64 // Number of bytes written to the head file 119 readMeter *metrics.Meter // Meter for measuring the effective amount of data read 120 writeMeter *metrics.Meter // Meter for measuring the effective amount of data written 121 sizeGauge *metrics.Gauge // Gauge for tracking the combined size of all freezer tables 122 123 logger log.Logger // Logger with database path and table name embedded 124 lock sync.RWMutex // Mutex protecting the data file descriptors 125 } 126 127 // newFreezerTable opens the given path as a freezer table. 128 func newFreezerTable(path, name string, config freezerTableConfig, readonly bool) (*freezerTable, error) { 129 return newTable(path, name, metrics.NewInactiveMeter(), metrics.NewInactiveMeter(), metrics.NewGauge(), freezerTableSize, config, readonly) 130 } 131 132 // newTable opens a freezer table, creating the data and index files if they are 133 // non-existent. Both files are truncated to the shortest common length to ensure 134 // they don't go out of sync. 135 func newTable(path string, name string, readMeter, writeMeter *metrics.Meter, sizeGauge *metrics.Gauge, maxFilesize uint32, config freezerTableConfig, readonly bool) (*freezerTable, error) { 136 // Ensure the containing directory exists and open the indexEntry file 137 if err := os.MkdirAll(path, 0755); err != nil { 138 return nil, err 139 } 140 var idxName string 141 if config.noSnappy { 142 idxName = fmt.Sprintf("%s.ridx", name) // raw index file 143 } else { 144 idxName = fmt.Sprintf("%s.cidx", name) // compressed index file 145 } 146 var ( 147 err error 148 index *os.File 149 meta *os.File 150 ) 151 if readonly { 152 // Will fail if table index file or meta file is not existent 153 index, err = openFreezerFileForReadOnly(filepath.Join(path, idxName)) 154 if err != nil { 155 return nil, err 156 } 157 meta, err = openFreezerFileForReadOnly(filepath.Join(path, fmt.Sprintf("%s.meta", name))) 158 if err != nil { 159 return nil, err 160 } 161 } else { 162 index, err = openFreezerFileForAppend(filepath.Join(path, idxName)) 163 if err != nil { 164 return nil, err 165 } 166 meta, err = openFreezerFileForAppend(filepath.Join(path, fmt.Sprintf("%s.meta", name))) 167 if err != nil { 168 return nil, err 169 } 170 } 171 // Load metadata from the file. The tag will be true if legacy metadata 172 // is detected. 173 metadata, err := newMetadata(meta) 174 if err != nil { 175 return nil, err 176 } 177 // Create the table and repair any past inconsistency 178 tab := &freezerTable{ 179 index: index, 180 metadata: metadata, 181 lastSync: time.Now(), 182 files: make(map[uint32]*os.File), 183 readMeter: readMeter, 184 writeMeter: writeMeter, 185 sizeGauge: sizeGauge, 186 name: name, 187 path: path, 188 logger: log.New("database", path, "table", name), 189 config: config, 190 readonly: readonly, 191 maxFileSize: maxFilesize, 192 } 193 if err := tab.repair(); err != nil { 194 tab.Close() 195 return nil, err 196 } 197 // Initialize the starting size counter 198 size, err := tab.sizeNolock() 199 if err != nil { 200 tab.Close() 201 return nil, err 202 } 203 tab.sizeGauge.Inc(int64(size)) 204 205 return tab, nil 206 } 207 208 // repair cross-checks the head and the index file and truncates them to 209 // be in sync with each other after a potential crash / data loss. 210 func (t *freezerTable) repair() error { 211 // Create a temporary offset buffer to init files with and read indexEntry into 212 buffer := make([]byte, indexEntrySize) 213 214 // If we've just created the files, initialize the index with the 0 indexEntry 215 stat, err := t.index.Stat() 216 if err != nil { 217 return err 218 } 219 if stat.Size() == 0 { 220 if _, err := t.index.Write(buffer); err != nil { 221 return err 222 } 223 } 224 // Ensure the index is a multiple of indexEntrySize bytes 225 if overflow := stat.Size() % indexEntrySize; overflow != 0 { 226 if t.readonly { 227 return fmt.Errorf("index file(path: %s, name: %s) size is not a multiple of %d", t.path, t.name, indexEntrySize) 228 } 229 if err := truncateFreezerFile(t.index, stat.Size()-overflow); err != nil { 230 return err 231 } // New file can't trigger this path 232 } 233 if err := t.repairIndex(); err != nil { 234 return err 235 } 236 // Retrieve the file sizes and prepare for truncation. Note the file size 237 // might be changed after index repair. 238 if stat, err = t.index.Stat(); err != nil { 239 return err 240 } 241 offsetsSize := stat.Size() 242 243 // Open the head file 244 var ( 245 firstIndex indexEntry 246 lastIndex indexEntry 247 contentSize int64 248 contentExp int64 249 verbose bool 250 ) 251 // Read index zero, determine what file is the earliest 252 // and what item offset to use 253 t.index.ReadAt(buffer, 0) 254 firstIndex.unmarshalBinary(buffer) 255 256 // Assign the tail fields with the first stored index. 257 // The total removed items is represented with an uint32, 258 // which is not enough in theory but enough in practice. 259 // TODO: use uint64 to represent total removed items. 260 t.tailId = firstIndex.filenum 261 t.itemOffset.Store(uint64(firstIndex.offset)) 262 263 // Adjust the number of hidden items if it is less than the number of items 264 // being removed. 265 if t.itemOffset.Load() > t.metadata.virtualTail { 266 if err := t.metadata.setVirtualTail(t.itemOffset.Load(), true); err != nil { 267 return err 268 } 269 } 270 t.itemHidden.Store(t.metadata.virtualTail) 271 272 // Read the last index, use the default value in case the freezer is empty 273 if offsetsSize == indexEntrySize { 274 lastIndex = indexEntry{filenum: t.tailId, offset: 0} 275 } else { 276 t.index.ReadAt(buffer, offsetsSize-indexEntrySize) 277 lastIndex.unmarshalBinary(buffer) 278 } 279 if t.readonly { 280 t.head, err = t.openFile(lastIndex.filenum, openFreezerFileForReadOnly) 281 } else { 282 t.head, err = t.openFile(lastIndex.filenum, openFreezerFileForAppend) 283 } 284 if err != nil { 285 return err 286 } 287 if stat, err = t.head.Stat(); err != nil { 288 return err 289 } 290 contentSize = stat.Size() 291 292 // Keep truncating both files until they come in sync 293 contentExp = int64(lastIndex.offset) 294 for contentExp != contentSize { 295 if t.readonly { 296 return fmt.Errorf("freezer table(path: %s, name: %s, num: %d) is corrupted", t.path, t.name, lastIndex.filenum) 297 } 298 verbose = true 299 300 // Truncate the head file to the last offset pointer 301 if contentExp < contentSize { 302 t.logger.Warn("Truncating dangling head", "indexed", contentExp, "stored", contentSize) 303 if err := truncateFreezerFile(t.head, contentExp); err != nil { 304 return err 305 } 306 contentSize = contentExp 307 } 308 // Truncate the index to point within the head file 309 if contentExp > contentSize { 310 t.logger.Warn("Truncating dangling indexes", "indexes", offsetsSize/indexEntrySize, "indexed", contentExp, "stored", contentSize) 311 312 newOffset := offsetsSize - indexEntrySize 313 if err := truncateFreezerFile(t.index, newOffset); err != nil { 314 return err 315 } 316 offsetsSize -= indexEntrySize 317 318 // If the index file is truncated beyond the flush offset, move the flush 319 // offset back to the new end of the file. A crash may occur before the 320 // offset is updated, leaving a dangling reference that points to a position 321 // outside the file. If so, the offset will be reset to the new end of the 322 // file during the next run. 323 if t.metadata.flushOffset > newOffset { 324 if err := t.metadata.setFlushOffset(newOffset, true); err != nil { 325 return err 326 } 327 } 328 // Read the new head index, use the default value in case 329 // the freezer is already empty. 330 var newLastIndex indexEntry 331 if offsetsSize == indexEntrySize { 332 newLastIndex = indexEntry{filenum: t.tailId, offset: 0} 333 } else { 334 t.index.ReadAt(buffer, offsetsSize-indexEntrySize) 335 newLastIndex.unmarshalBinary(buffer) 336 } 337 // We might have slipped back into an earlier head-file here 338 if newLastIndex.filenum != lastIndex.filenum { 339 // Release earlier opened file 340 t.releaseFile(lastIndex.filenum) 341 if t.head, err = t.openFile(newLastIndex.filenum, openFreezerFileForAppend); err != nil { 342 return err 343 } 344 if stat, err = t.head.Stat(); err != nil { 345 // TODO, anything more we can do here? 346 // A data file has gone missing... 347 return err 348 } 349 contentSize = stat.Size() 350 } 351 lastIndex = newLastIndex 352 contentExp = int64(lastIndex.offset) 353 } 354 } 355 // Sync() fails for read-only files on windows. 356 if !t.readonly { 357 // Ensure all reparation changes have been written to disk 358 if err := t.index.Sync(); err != nil { 359 return err 360 } 361 if err := t.head.Sync(); err != nil { 362 return err 363 } 364 if err := t.metadata.file.Sync(); err != nil { 365 return err 366 } 367 } 368 // Update the item and byte counters and return 369 t.items.Store(t.itemOffset.Load() + uint64(offsetsSize/indexEntrySize-1)) // last indexEntry points to the end of the data file 370 t.headBytes = contentSize 371 t.headId = lastIndex.filenum 372 373 // Delete the leftover files because of head deletion 374 t.releaseFilesAfter(t.headId, true) 375 376 // Delete the leftover files because of tail deletion 377 t.releaseFilesBefore(t.tailId, true) 378 379 // Close opened files and preopen all files 380 if err := t.preopen(); err != nil { 381 return err 382 } 383 if verbose { 384 t.logger.Info("Chain freezer table opened", "items", t.items.Load(), "deleted", t.itemOffset.Load(), "hidden", t.itemHidden.Load(), "tailId", t.tailId, "headId", t.headId, "size", t.headBytes) 385 } else { 386 t.logger.Debug("Chain freezer table opened", "items", t.items.Load(), "size", common.StorageSize(t.headBytes)) 387 } 388 return nil 389 } 390 391 func (t *freezerTable) repairIndex() error { 392 stat, err := t.index.Stat() 393 if err != nil { 394 return err 395 } 396 size := stat.Size() 397 398 // Validate the items in the index file to ensure the data integrity. 399 // It's possible some garbage data is retained in the index file after 400 // the power failures and should be truncated first. 401 size, err = t.checkIndex(size) 402 if err != nil { 403 return err 404 } 405 // If legacy metadata is detected, attempt to recover the offset from the 406 // index file to avoid clearing the entire table. 407 if t.metadata.version == freezerTableV1 { 408 // Skip truncation if the legacy metadata is opened in read-only mode. 409 // Since all items in the legacy index file were forcibly synchronized, 410 // data integrity is guaranteed. Therefore, it's safe to leave any extra 411 // items untruncated in this special scenario. 412 if t.readonly { 413 return nil 414 } 415 t.logger.Info("Recovering freezer flushOffset for legacy table", "offset", size) 416 return t.metadata.setFlushOffset(size, true) 417 } 418 419 switch { 420 case size == indexEntrySize && t.metadata.flushOffset == 0: 421 // It's a new freezer table with no content. 422 // Move the flush offset to the end of the file. 423 return t.metadata.setFlushOffset(size, true) 424 425 case size == t.metadata.flushOffset: 426 // flushOffset is aligned with the index file, all is well. 427 return nil 428 429 case size > t.metadata.flushOffset: 430 // Extra index items have been detected beyond the flush offset. Since these 431 // entries correspond to data that has not been fully flushed to disk in the 432 // last run (because of unclean shutdown), their integrity cannot be guaranteed. 433 // To ensure consistency, these index items will be truncated, as there is no 434 // reliable way to validate or recover their associated data. 435 extraSize := size - t.metadata.flushOffset 436 if t.readonly { 437 return fmt.Errorf("index file(path: %s, name: %s) contains %d garbage data bytes", t.path, t.name, extraSize) 438 } 439 t.logger.Warn("Truncating freezer items after flushOffset", "size", extraSize) 440 return truncateFreezerFile(t.index, t.metadata.flushOffset) 441 442 default: // size < flushOffset 443 // Flush offset refers to a position larger than index file. The only 444 // possible scenario for this is: a power failure or system crash has occurred after 445 // truncating the segment in index file from head or tail, but without updating 446 // the flush offset. In this case, automatically reset the flush offset with 447 // the file size which implies the entire index file is complete. 448 if t.readonly { 449 return nil // do nothing in read only mode 450 } 451 t.logger.Warn("Rewinding freezer flushOffset", "old", t.metadata.flushOffset, "new", size) 452 return t.metadata.setFlushOffset(size, true) 453 } 454 } 455 456 // checkIndex validates the integrity of the index file. According to the design, 457 // the initial entry in the file denotes the earliest data file along with the 458 // count of deleted items. Following this, all subsequent entries in the file must 459 // be in order. This function identifies any corrupted entries and truncates items 460 // occurring after the corruption point. 461 // 462 // corruption can occur because of the power failure. In the Linux kernel, the 463 // file metadata update and data update are not necessarily performed at the 464 // same time. Typically, the metadata will be flushed/journalled ahead of the file 465 // data. Therefore, we make the pessimistic assumption that the file is first 466 // extended with invalid "garbage" data (normally zero bytes) and that afterwards 467 // the correct data replaces the garbage. As all the items in index file are 468 // supposed to be in-order, the leftover garbage must be truncated before the 469 // index data is utilized. 470 // 471 // It's important to note an exception that's unfortunately undetectable: when 472 // all index entries in the file are zero. Distinguishing whether they represent 473 // leftover garbage or if all items in the table have zero size is impossible. 474 // In such instances, the file will remain unchanged to prevent potential data 475 // loss or misinterpretation. 476 func (t *freezerTable) checkIndex(size int64) (int64, error) { 477 // Move the read cursor to the beginning of the file 478 _, err := t.index.Seek(0, io.SeekStart) 479 if err != nil { 480 return 0, err 481 } 482 fr := bufio.NewReader(t.index) 483 484 var ( 485 start = time.Now() 486 buff = make([]byte, indexEntrySize) 487 prev indexEntry 488 head indexEntry 489 490 read = func() (indexEntry, error) { 491 n, err := io.ReadFull(fr, buff) 492 if err != nil { 493 return indexEntry{}, err 494 } 495 if n != indexEntrySize { 496 return indexEntry{}, fmt.Errorf("failed to read from index, n: %d", n) 497 } 498 var entry indexEntry 499 entry.unmarshalBinary(buff) 500 return entry, nil 501 } 502 truncate = func(offset int64) (int64, error) { 503 if t.readonly { 504 return 0, fmt.Errorf("index file is corrupted at %d, size: %d", offset, size) 505 } 506 if err := truncateFreezerFile(t.index, offset); err != nil { 507 return 0, err 508 } 509 log.Warn("Truncated index file", "offset", offset, "truncated", size-offset) 510 return offset, nil 511 } 512 ) 513 for offset := int64(0); offset < size; offset += indexEntrySize { 514 entry, err := read() 515 if err != nil { 516 return 0, err 517 } 518 if offset == 0 { 519 head = entry 520 continue 521 } 522 // Ensure that the first non-head index refers to the earliest file, 523 // or the next file if the earliest file has no space to place the 524 // first item. 525 if offset == indexEntrySize { 526 if entry.filenum != head.filenum && entry.filenum != head.filenum+1 { 527 log.Error("Corrupted index item detected", "earliest", head.filenum, "filenumber", entry.filenum) 528 return truncate(offset) 529 } 530 prev = entry 531 continue 532 } 533 // ensure two consecutive index items are in order 534 if err := t.checkIndexItems(prev, entry); err != nil { 535 log.Error("Corrupted index item detected", "err", err) 536 return truncate(offset) 537 } 538 prev = entry 539 } 540 // Move the read cursor to the end of the file. While theoretically, the 541 // cursor should reach the end by reading all the items in the file, perform 542 // the seek operation anyway as a precaution. 543 _, err = t.index.Seek(0, io.SeekEnd) 544 if err != nil { 545 return 0, err 546 } 547 log.Debug("Verified index file", "items", size/indexEntrySize, "elapsed", common.PrettyDuration(time.Since(start))) 548 return size, nil 549 } 550 551 // checkIndexItems validates the correctness of two consecutive index items based 552 // on the following rules: 553 // 554 // - The file number of two consecutive index items must either be the same or 555 // increase monotonically. If the file number decreases or skips in a 556 // non-sequential manner, the index item is considered invalid. 557 // 558 // - For index items with the same file number, the data offset must be in 559 // non-decreasing order. Note: Two index items with the same file number 560 // and the same data offset are permitted if the entry size is zero. 561 // 562 // - The first index item in a new data file must not have a zero data offset. 563 func (t *freezerTable) checkIndexItems(a, b indexEntry) error { 564 if b.filenum != a.filenum && b.filenum != a.filenum+1 { 565 return fmt.Errorf("index items with inconsistent file number, prev: %d, next: %d", a.filenum, b.filenum) 566 } 567 if b.filenum == a.filenum && b.offset < a.offset { 568 return fmt.Errorf("index items with unordered offset, prev: %d, next: %d", a.offset, b.offset) 569 } 570 if b.filenum == a.filenum+1 && b.offset == 0 { 571 return fmt.Errorf("index items with zero offset, file number: %d", b.filenum) 572 } 573 return nil 574 } 575 576 // preopen opens all files that the freezer will need. This method should be called from an init-context, 577 // since it assumes that it doesn't have to bother with locking 578 // The rationale for doing preopen is to not have to do it from within Retrieve, thus not needing to ever 579 // obtain a write-lock within Retrieve. 580 func (t *freezerTable) preopen() (err error) { 581 // The repair might have already opened (some) files 582 t.releaseFilesAfter(0, false) 583 584 // Open all except head in RDONLY 585 for i := t.tailId; i < t.headId; i++ { 586 if _, err = t.openFile(i, openFreezerFileForReadOnly); err != nil { 587 return err 588 } 589 } 590 if t.readonly { 591 t.head, err = t.openFile(t.headId, openFreezerFileForReadOnly) 592 } else { 593 // Open head in read/write 594 t.head, err = t.openFile(t.headId, openFreezerFileForAppend) 595 } 596 return err 597 } 598 599 // truncateHead discards any recent data above the provided threshold number. 600 func (t *freezerTable) truncateHead(items uint64) error { 601 t.lock.Lock() 602 defer t.lock.Unlock() 603 604 // Ensure the given truncate target falls in the correct range 605 existing := t.items.Load() 606 if existing <= items { 607 return nil 608 } 609 if items < t.itemHidden.Load() { 610 return errors.New("truncation below tail") 611 } 612 // We need to truncate, save the old size for metrics tracking 613 oldSize, err := t.sizeNolock() 614 if err != nil { 615 return err 616 } 617 // Something's out of sync, truncate the table's offset index 618 log := t.logger.Debug 619 if existing > items+1 { 620 log = t.logger.Warn // Only loud warn if we delete multiple items 621 } 622 log("Truncating freezer table", "items", existing, "limit", items) 623 624 // Truncate the index file first, the tail position is also considered 625 // when calculating the new freezer table length. 626 length := items - t.itemOffset.Load() 627 newOffset := (length + 1) * indexEntrySize 628 if err := truncateFreezerFile(t.index, int64(newOffset)); err != nil { 629 return err 630 } 631 if err := t.index.Sync(); err != nil { 632 return err 633 } 634 // If the index file is truncated beyond the flush offset, move the flush 635 // offset back to the new end of the file. A crash may occur before the 636 // offset is updated, leaving a dangling reference that points to a position 637 // outside the file. If so, the offset will be reset to the new end of the 638 // file during the next run. 639 if t.metadata.flushOffset > int64(newOffset) { 640 if err := t.metadata.setFlushOffset(int64(newOffset), true); err != nil { 641 return err 642 } 643 } 644 // Calculate the new expected size of the data file and truncate it 645 var expected indexEntry 646 if length == 0 { 647 expected = indexEntry{filenum: t.tailId, offset: 0} 648 } else { 649 buffer := make([]byte, indexEntrySize) 650 if _, err := t.index.ReadAt(buffer, int64(length*indexEntrySize)); err != nil { 651 return err 652 } 653 expected.unmarshalBinary(buffer) 654 } 655 // We might need to truncate back to older files 656 if expected.filenum != t.headId { 657 // If already open for reading, force-reopen for writing 658 t.releaseFile(expected.filenum) 659 newHead, err := t.openFile(expected.filenum, openFreezerFileForAppend) 660 if err != nil { 661 return err 662 } 663 // Release any files _after the current head -- both the previous head 664 // and any files which may have been opened for reading 665 t.releaseFilesAfter(expected.filenum, true) 666 667 // Set back the historic head 668 t.head = newHead 669 t.headId = expected.filenum 670 } 671 if err := truncateFreezerFile(t.head, int64(expected.offset)); err != nil { 672 return err 673 } 674 if err := t.head.Sync(); err != nil { 675 return err 676 } 677 // All data files truncated, set internal counters and return 678 t.headBytes = int64(expected.offset) 679 t.items.Store(items) 680 681 // Retrieve the new size and update the total size counter 682 newSize, err := t.sizeNolock() 683 if err != nil { 684 return err 685 } 686 t.sizeGauge.Dec(int64(oldSize - newSize)) 687 return nil 688 } 689 690 // sizeHidden returns the total data size of hidden items in the freezer table. 691 // This function assumes the lock is already held. 692 func (t *freezerTable) sizeHidden() (uint64, error) { 693 hidden, offset := t.itemHidden.Load(), t.itemOffset.Load() 694 if hidden <= offset { 695 return 0, nil 696 } 697 indices, err := t.getIndices(hidden-1, 1) 698 if err != nil { 699 return 0, err 700 } 701 return uint64(indices[1].offset), nil 702 } 703 704 // truncateTail discards any recent data before the provided threshold number. 705 func (t *freezerTable) truncateTail(items uint64) error { 706 t.lock.Lock() 707 defer t.lock.Unlock() 708 709 // Ensure the given truncate target falls in the correct range 710 if t.itemHidden.Load() >= items { 711 return nil 712 } 713 if t.items.Load() < items { 714 return errors.New("truncation above head") 715 } 716 // Load the new tail index by the given new tail position 717 var ( 718 newTailId uint32 719 buffer = make([]byte, indexEntrySize) 720 ) 721 if t.items.Load() == items { 722 newTailId = t.headId 723 } else { 724 offset := items - t.itemOffset.Load() 725 if _, err := t.index.ReadAt(buffer, int64((offset+1)*indexEntrySize)); err != nil { 726 return err 727 } 728 var newTail indexEntry 729 newTail.unmarshalBinary(buffer) 730 newTailId = newTail.filenum 731 } 732 // Save the old size for metrics tracking. This needs to be done 733 // before any updates to either itemHidden or itemOffset. 734 oldSize, err := t.sizeNolock() 735 if err != nil { 736 return err 737 } 738 // Update the virtual tail marker and hidden these entries in table. 739 t.itemHidden.Store(items) 740 741 // Update the virtual tail without fsync, otherwise it will significantly 742 // impact the overall performance. 743 if err := t.metadata.setVirtualTail(items, false); err != nil { 744 return err 745 } 746 // Hidden items still fall in the current tail file, no data file 747 // can be dropped. 748 if t.tailId == newTailId { 749 return nil 750 } 751 // Hidden items fall in the incorrect range, returns the error. 752 if t.tailId > newTailId { 753 return fmt.Errorf("invalid index, tail-file %d, item-file %d", t.tailId, newTailId) 754 } 755 // Sync the table before performing the index tail truncation. A crash may 756 // occur after truncating the index file without updating the flush offset, 757 // leaving a dangling offset that points to a position outside the file. 758 // The offset will be rewound to the end of file during the next run 759 // automatically and implicitly assumes all the items within the file are 760 // complete. 761 // 762 // Therefore, forcibly flush everything above the offset to ensure this 763 // assumption is satisfied! 764 if err := t.doSync(); err != nil { 765 return err 766 } 767 // Count how many items can be deleted from the file. 768 var ( 769 newDeleted = items 770 deleted = t.itemOffset.Load() 771 ) 772 // Hidden items exceed the current tail file, drop the relevant data files. 773 for current := items - 1; current >= deleted; current -= 1 { 774 if _, err := t.index.ReadAt(buffer, int64((current-deleted+1)*indexEntrySize)); err != nil { 775 return err 776 } 777 var pre indexEntry 778 pre.unmarshalBinary(buffer) 779 if pre.filenum != newTailId { 780 break 781 } 782 newDeleted = current 783 } 784 // Close the index file before shorten it. 785 if err := t.index.Close(); err != nil { 786 return err 787 } 788 // Truncate the deleted index entries from the index file. 789 err = copyFrom(t.index.Name(), t.index.Name(), indexEntrySize*(newDeleted-deleted+1), func(f *os.File) error { 790 tailIndex := indexEntry{ 791 filenum: newTailId, 792 offset: uint32(newDeleted), 793 } 794 _, err := f.Write(tailIndex.append(nil)) 795 return err 796 }) 797 if err != nil { 798 return err 799 } 800 // Reopen the modified index file to load the changes 801 t.index, err = openFreezerFileForAppend(t.index.Name()) 802 if err != nil { 803 return err 804 } 805 // Sync the file to ensure changes are flushed to disk 806 if err := t.index.Sync(); err != nil { 807 return err 808 } 809 // Release any files before the current tail 810 t.tailId = newTailId 811 t.itemOffset.Store(newDeleted) 812 t.releaseFilesBefore(t.tailId, true) 813 814 // Move the index flush offset backward due to the deletion of an index segment. 815 // A crash may occur before the offset is updated, leaving a dangling reference 816 // that points to a position outside the file. If so, the offset will be reset 817 // to the new end of the file during the next run. 818 // 819 // Note, both the index and head data file has been persisted before performing 820 // tail truncation and all the items in these files are regarded as complete. 821 shorten := indexEntrySize * int64(newDeleted-deleted) 822 if t.metadata.flushOffset <= shorten { 823 return fmt.Errorf("invalid index flush offset: %d, shorten: %d", t.metadata.flushOffset, shorten) 824 } else { 825 if err := t.metadata.setFlushOffset(t.metadata.flushOffset-shorten, true); err != nil { 826 return err 827 } 828 } 829 // Retrieve the new size and update the total size counter 830 newSize, err := t.sizeNolock() 831 if err != nil { 832 return err 833 } 834 t.sizeGauge.Dec(int64(oldSize - newSize)) 835 return nil 836 } 837 838 // Close closes all opened files and finalizes the freezer table for use. 839 // This operation must be completed before shutdown to prevent the loss of 840 // recent writes. 841 func (t *freezerTable) Close() error { 842 t.lock.Lock() 843 defer t.lock.Unlock() 844 845 if err := t.doSync(); err != nil { 846 return err 847 } 848 var errs []error 849 doClose := func(f *os.File) { 850 if err := f.Close(); err != nil { 851 errs = append(errs, err) 852 } 853 } 854 doClose(t.index) 855 doClose(t.metadata.file) 856 for _, f := range t.files { 857 doClose(f) 858 } 859 t.index = nil 860 t.head = nil 861 t.metadata.file = nil 862 863 if errs != nil { 864 return fmt.Errorf("%v", errs) 865 } 866 return nil 867 } 868 869 // openFile assumes that the write-lock is held by the caller 870 func (t *freezerTable) openFile(num uint32, opener func(string) (*os.File, error)) (f *os.File, err error) { 871 var exist bool 872 if f, exist = t.files[num]; !exist { 873 var name string 874 if t.config.noSnappy { 875 name = fmt.Sprintf("%s.%04d.rdat", t.name, num) 876 } else { 877 name = fmt.Sprintf("%s.%04d.cdat", t.name, num) 878 } 879 f, err = opener(filepath.Join(t.path, name)) 880 if err != nil { 881 return nil, err 882 } 883 t.files[num] = f 884 } 885 return f, err 886 } 887 888 // releaseFile closes a file, and removes it from the open file cache. 889 // Assumes that the caller holds the write lock 890 func (t *freezerTable) releaseFile(num uint32) { 891 if f, exist := t.files[num]; exist { 892 delete(t.files, num) 893 f.Close() 894 } 895 } 896 897 // releaseFilesAfter closes all open files with a higher number, and optionally also deletes the files 898 func (t *freezerTable) releaseFilesAfter(num uint32, remove bool) { 899 for fnum, f := range t.files { 900 if fnum > num { 901 delete(t.files, fnum) 902 f.Close() 903 if remove { 904 os.Remove(f.Name()) 905 } 906 } 907 } 908 } 909 910 // releaseFilesBefore closes all open files with a lower number, and optionally also deletes the files 911 func (t *freezerTable) releaseFilesBefore(num uint32, remove bool) { 912 for fnum, f := range t.files { 913 if fnum < num { 914 delete(t.files, fnum) 915 f.Close() 916 if remove { 917 os.Remove(f.Name()) 918 } 919 } 920 } 921 } 922 923 // getIndices returns the index entries for the given from-item, covering 'count' items. 924 // N.B: The actual number of returned indices for N items will always be N+1 (unless an 925 // error is returned). 926 // OBS: This method assumes that the caller has already verified (and/or trimmed) the range 927 // so that the items are within bounds. If this method is used to read out of bounds, 928 // it will return error. 929 func (t *freezerTable) getIndices(from, count uint64) ([]*indexEntry, error) { 930 // Apply the table-offset 931 from = from - t.itemOffset.Load() 932 933 // For reading N items, we need N+1 indices. 934 buffer := make([]byte, (count+1)*indexEntrySize) 935 if _, err := t.index.ReadAt(buffer, int64(from*indexEntrySize)); err != nil { 936 return nil, err 937 } 938 var ( 939 indices []*indexEntry 940 offset int 941 ) 942 for i := from; i <= from+count; i++ { 943 index := new(indexEntry) 944 index.unmarshalBinary(buffer[offset:]) 945 offset += indexEntrySize 946 indices = append(indices, index) 947 } 948 if from == 0 { 949 // Special case if we're reading the first item in the freezer. We assume that 950 // the first item always start from zero(regarding the deletion, we 951 // only support deletion by files, so that the assumption is held). 952 // This means we can use the first item metadata to carry information about 953 // the 'global' offset, for the deletion-case 954 indices[0].offset = 0 955 indices[0].filenum = indices[1].filenum 956 } 957 return indices, nil 958 } 959 960 // Retrieve looks up the data offset of an item with the given number and retrieves 961 // the raw binary blob from the data file. 962 func (t *freezerTable) Retrieve(item uint64) ([]byte, error) { 963 items, err := t.RetrieveItems(item, 1, 0) 964 if err != nil { 965 return nil, err 966 } 967 return items[0], nil 968 } 969 970 // RetrieveItems returns multiple items in sequence, starting from the index 'start'. 971 // It will return at most 'max' items, but will abort earlier to respect the 972 // 'maxBytes' argument. However, if the 'maxBytes' is smaller than the size of one 973 // item, it _will_ return one element and possibly overflow the maxBytes. 974 func (t *freezerTable) RetrieveItems(start, count, maxBytes uint64) ([][]byte, error) { 975 // First we read the 'raw' data, which might be compressed. 976 diskData, sizes, err := t.retrieveItems(start, count, maxBytes) 977 if err != nil { 978 return nil, err 979 } 980 var ( 981 output = make([][]byte, 0, count) 982 offset int // offset for reading 983 outputSize int // size of uncompressed data 984 ) 985 // Now slice up the data and decompress. 986 for i, diskSize := range sizes { 987 item := diskData[offset : offset+diskSize] 988 offset += diskSize 989 decompressedSize := diskSize 990 if !t.config.noSnappy { 991 decompressedSize, _ = snappy.DecodedLen(item) 992 } 993 if i > 0 && maxBytes != 0 && uint64(outputSize+decompressedSize) > maxBytes { 994 break 995 } 996 if !t.config.noSnappy { 997 data, err := snappy.Decode(nil, item) 998 if err != nil { 999 return nil, err 1000 } 1001 output = append(output, data) 1002 } else { 1003 output = append(output, item) 1004 } 1005 outputSize += decompressedSize 1006 } 1007 return output, nil 1008 } 1009 1010 // retrieveItems reads up to 'count' items from the table. It reads at least 1011 // one item, but otherwise avoids reading more than maxBytes bytes. Freezer 1012 // will ignore the size limitation and continuously allocate memory to store 1013 // data if maxBytes is 0. It returns the (potentially compressed) data, and 1014 // the sizes. 1015 func (t *freezerTable) retrieveItems(start, count, maxBytes uint64) ([]byte, []int, error) { 1016 t.lock.RLock() 1017 defer t.lock.RUnlock() 1018 1019 // Ensure the table and the item are accessible 1020 if t.index == nil || t.head == nil || t.metadata.file == nil { 1021 return nil, nil, errClosed 1022 } 1023 var ( 1024 items = t.items.Load() // the total items(head + 1) 1025 hidden = t.itemHidden.Load() // the number of hidden items 1026 ) 1027 // Ensure the start is written, not deleted from the tail, and that the 1028 // caller actually wants something 1029 if items <= start || hidden > start || count == 0 { 1030 return nil, nil, errOutOfBounds 1031 } 1032 if start+count > items { 1033 count = items - start 1034 } 1035 var output []byte // Buffer to read data into 1036 if maxBytes != 0 { 1037 output = make([]byte, 0, maxBytes) 1038 } else { 1039 output = make([]byte, 0, 1024) // initial buffer cap 1040 } 1041 // readData is a helper method to read a single data item from disk. 1042 readData := func(fileId, start uint32, length int) error { 1043 output = grow(output, length) 1044 dataFile, exist := t.files[fileId] 1045 if !exist { 1046 return fmt.Errorf("missing data file %d", fileId) 1047 } 1048 if _, err := dataFile.ReadAt(output[len(output)-length:], int64(start)); err != nil { 1049 return fmt.Errorf("%w, fileid: %d, start: %d, length: %d", err, fileId, start, length) 1050 } 1051 return nil 1052 } 1053 // Read all the indexes in one go 1054 indices, err := t.getIndices(start, count) 1055 if err != nil { 1056 return nil, nil, err 1057 } 1058 var ( 1059 sizes []int // The sizes for each element 1060 totalSize = 0 // The total size of all data read so far 1061 readStart = indices[0].offset // Where, in the file, to start reading 1062 unreadSize = 0 // The size of the as-yet-unread data 1063 ) 1064 1065 for i, firstIndex := range indices[:len(indices)-1] { 1066 secondIndex := indices[i+1] 1067 // Determine the size of the item. 1068 offset1, offset2, _ := firstIndex.bounds(secondIndex) 1069 size := int(offset2 - offset1) 1070 // Crossing a file boundary? 1071 if secondIndex.filenum != firstIndex.filenum { 1072 // If we have unread data in the first file, we need to do that read now. 1073 if unreadSize > 0 { 1074 if err := readData(firstIndex.filenum, readStart, unreadSize); err != nil { 1075 return nil, nil, err 1076 } 1077 unreadSize = 0 1078 } 1079 readStart = 0 1080 } 1081 if i > 0 && uint64(totalSize+size) > maxBytes && maxBytes != 0 { 1082 // About to break out due to byte limit being exceeded. We don't 1083 // read this last item, but we need to do the deferred reads now. 1084 if unreadSize > 0 { 1085 if err := readData(secondIndex.filenum, readStart, unreadSize); err != nil { 1086 return nil, nil, err 1087 } 1088 } 1089 break 1090 } 1091 // Defer the read for later 1092 unreadSize += size 1093 totalSize += size 1094 sizes = append(sizes, size) 1095 if i == len(indices)-2 || (uint64(totalSize) > maxBytes && maxBytes != 0) { 1096 // Last item, need to do the read now 1097 if err := readData(secondIndex.filenum, readStart, unreadSize); err != nil { 1098 return nil, nil, err 1099 } 1100 break 1101 } 1102 } 1103 1104 // Update metrics. 1105 t.readMeter.Mark(int64(totalSize)) 1106 return output, sizes, nil 1107 } 1108 1109 // size returns the total data size in the freezer table. 1110 func (t *freezerTable) size() (uint64, error) { 1111 t.lock.RLock() 1112 defer t.lock.RUnlock() 1113 1114 return t.sizeNolock() 1115 } 1116 1117 // sizeNolock returns the total data size in the freezer table. This function 1118 // assumes the lock is already held. 1119 func (t *freezerTable) sizeNolock() (uint64, error) { 1120 stat, err := t.index.Stat() 1121 if err != nil { 1122 return 0, err 1123 } 1124 hidden, err := t.sizeHidden() 1125 if err != nil { 1126 return 0, err 1127 } 1128 total := uint64(t.maxFileSize)*uint64(t.headId-t.tailId) + uint64(t.headBytes) + uint64(stat.Size()) - hidden 1129 return total, nil 1130 } 1131 1132 // advanceHead should be called when the current head file would outgrow the file limits, 1133 // and a new file must be opened. The caller of this method must hold the write-lock 1134 // before calling this method. 1135 func (t *freezerTable) advanceHead() error { 1136 t.lock.Lock() 1137 defer t.lock.Unlock() 1138 1139 if err := t.doSync(); err != nil { 1140 return err 1141 } 1142 // We open the next file in truncated mode -- if this file already 1143 // exists, we need to start over from scratch on it. 1144 nextID := t.headId + 1 1145 newHead, err := t.openFile(nextID, openFreezerFileTruncated) 1146 if err != nil { 1147 return err 1148 } 1149 // Commit the contents of the old file to stable storage and 1150 // tear it down. It will be re-opened in read-only mode. 1151 if err := t.head.Sync(); err != nil { 1152 return err 1153 } 1154 t.releaseFile(t.headId) 1155 t.openFile(t.headId, openFreezerFileForReadOnly) 1156 1157 // Swap out the current head. 1158 t.head = newHead 1159 t.headBytes = 0 1160 t.headId = nextID 1161 return nil 1162 } 1163 1164 // Sync pushes any pending data from memory out to disk. This is an expensive 1165 // operation, so use it with care. 1166 func (t *freezerTable) Sync() error { 1167 t.lock.Lock() 1168 defer t.lock.Unlock() 1169 1170 return t.doSync() 1171 } 1172 1173 // doSync is the internal version of Sync which assumes the lock is already held. 1174 func (t *freezerTable) doSync() error { 1175 // Trying to fsync a file opened in rdonly causes "Access denied" 1176 // error on Windows. 1177 if t.readonly { 1178 return nil 1179 } 1180 if t.index == nil || t.head == nil || t.metadata.file == nil { 1181 return errClosed 1182 } 1183 var err error 1184 trackError := func(e error) { 1185 if e != nil && err == nil { 1186 err = e 1187 } 1188 } 1189 trackError(t.index.Sync()) 1190 trackError(t.head.Sync()) 1191 1192 // A crash may occur before the offset is updated, leaving the offset 1193 // points to a old position. If so, the extra items above the offset 1194 // will be truncated during the next run. 1195 stat, err := t.index.Stat() 1196 if err != nil { 1197 return err 1198 } 1199 offset := stat.Size() 1200 trackError(t.metadata.setFlushOffset(offset, true)) 1201 return err 1202 } 1203 1204 func (t *freezerTable) dumpIndexStdout(start, stop int64) { 1205 t.dumpIndex(os.Stdout, start, stop) 1206 } 1207 1208 func (t *freezerTable) dumpIndexString(start, stop int64) string { 1209 var out bytes.Buffer 1210 out.WriteString("\n") 1211 t.dumpIndex(&out, start, stop) 1212 return out.String() 1213 } 1214 1215 func (t *freezerTable) dumpIndex(w io.Writer, start, stop int64) { 1216 fmt.Fprintf(w, "Version %d count %d, deleted %d, hidden %d\n", 1217 t.metadata.version, t.items.Load(), t.itemOffset.Load(), t.itemHidden.Load()) 1218 1219 buf := make([]byte, indexEntrySize) 1220 1221 fmt.Fprintf(w, "| number | fileno | offset |\n") 1222 fmt.Fprintf(w, "|--------|--------|--------|\n") 1223 1224 for i := uint64(start); ; i++ { 1225 if _, err := t.index.ReadAt(buf, int64((i+1)*indexEntrySize)); err != nil { 1226 break 1227 } 1228 var entry indexEntry 1229 entry.unmarshalBinary(buf) 1230 fmt.Fprintf(w, "| %03d | %03d | %03d | \n", i, entry.filenum, entry.offset) 1231 if stop > 0 && i >= uint64(stop) { 1232 break 1233 } 1234 } 1235 fmt.Fprintf(w, "|--------------------------|\n") 1236 }