github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/journal_writer.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
    "bufio"
    "context"
    "errors"
    "fmt"
    "hash/crc32"
    "io"
    "os"
    "path/filepath"
    "sync"

    "github.com/dolthub/swiss"
    "github.com/sirupsen/logrus"
    "golang.org/x/sync/errgroup"

    "github.com/dolthub/dolt/go/store/chunks"
    "github.com/dolthub/dolt/go/store/hash"
)

const (
    chunkJournalFileSize = 16 * 1024

    // todo(andy): buffer must be able to hold an entire record,
    // but we don't have a hard limit on record size right now
    journalWriterBuffSize = 1024 * 1024

    chunkJournalAddr = chunks.JournalFileID

    journalIndexFileName = "journal.idx"

    // journalIndexDefaultMaxNovel determines how often we flush
    // records to the out-of-band journal index file.
    journalIndexDefaultMaxNovel = 16384

    // journalMaybeSyncThreshold determines how much un-synced written data
    // can be outstanding to the journal before we will sync it.
    journalMaybeSyncThreshold = 64 * 1024 * 1024
)

var (
    journalAddr = hash.Parse(chunkJournalAddr)
)

func isJournalAddr(h hash.Hash) bool {
    return h == journalAddr
}

func fileExists(path string) (bool, error) {
    var err error
    if path, err = filepath.Abs(path); err != nil {
        return false, err
    }

    info, err := os.Stat(path)
    if errors.Is(err, os.ErrNotExist) {
        return false, nil
    } else if err != nil {
        // guard against dereferencing a nil |info| on other stat errors
        return false, err
    } else if info.IsDir() {
        return true, fmt.Errorf("expected file %s, found directory", path)
    }
    return true, nil
}

func openJournalWriter(ctx context.Context, path string) (wr *journalWriter, exists bool, err error) {
    var f *os.File
    if path, err = filepath.Abs(path); err != nil {
        return nil, false, err
    }

    info, err := os.Stat(path)
    if errors.Is(err, os.ErrNotExist) {
        return nil, false, nil
    } else if err != nil {
        return nil, false, err
    } else if info.IsDir() {
        return nil, true, fmt.Errorf("expected file %s, found directory", chunkJournalName)
    }
    if f, err = os.OpenFile(path, os.O_RDWR, 0666); err != nil {
        return nil, true, err
    }

    return &journalWriter{
        buf:     make([]byte, 0, journalWriterBuffSize),
        journal: f,
        path:    path,
    }, true, nil
}
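// openJournalWriter and createJournalWriter (below) are natural complements:
// probe for an existing journal, and zero-fill a fresh one only if none is
// found. A minimal sketch of that open-or-create pattern (the helper name is
// illustrative, not part of this file's API):
func openOrCreateJournal(ctx context.Context, path string) (*journalWriter, error) {
    wr, exists, err := openJournalWriter(ctx, path)
    if err != nil {
        return nil, err
    }
    if !exists {
        // no journal on disk yet; create and zero-fill one
        return createJournalWriter(ctx, path)
    }
    return wr, nil
}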
func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, err error) {
    var f *os.File
    if path, err = filepath.Abs(path); err != nil {
        return nil, err
    }

    _, err = os.Stat(path)
    if err == nil {
        return nil, fmt.Errorf("journal file %s already exists", chunkJournalName)
    } else if !errors.Is(err, os.ErrNotExist) {
        return nil, err
    }

    if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0666); err != nil {
        return nil, err
    }
    const batch = 1024 * 1024
    b := make([]byte, batch)
    for i := 0; i < chunkJournalFileSize; i += batch {
        if _, err = f.Write(b); err != nil { // zero fill |f|
            return nil, err
        }
    }
    if err = f.Sync(); err != nil {
        return nil, err
    }
    if o, err := f.Seek(0, io.SeekStart); err != nil {
        return nil, err
    } else if o != 0 {
        return nil, fmt.Errorf("expected file offset 0, got %d", o)
    }

    return &journalWriter{
        buf:     make([]byte, 0, journalWriterBuffSize),
        journal: f,
        path:    path,
    }, nil
}

func deleteJournalAndIndexFiles(ctx context.Context, path string) (err error) {
    if err = os.Remove(path); err != nil {
        return err
    }
    idxPath := filepath.Join(filepath.Dir(path), journalIndexFileName)
    return os.Remove(idxPath)
}

type journalWriter struct {
    // buf batches record writes before they reach the journal file
    buf []byte

    journal *os.File
    // off is the journal file offset at which buffered data will be
    // written; bytes in the range [0, off) have already reached the file
    off int64
    // indexed is the high-water mark of the journal covered by the index file
    indexed int64
    path    string
    // uncmpSz tracks the total uncompressed size of chunks in the journal
    uncmpSz uint64

    // unsyncd counts bytes written to the journal since the last fsync
    unsyncd     uint64
    currentRoot hash.Hash

    ranges      rangeIndex
    index       *os.File
    indexWriter *bufio.Writer
    // batchCrc accumulates a checksum over the current batch of index lookups
    batchCrc uint32
    maxNovel int

    lock sync.RWMutex
}

var _ io.Closer = &journalWriter{}
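// A hedged sketch of the invariants the fields above are assumed to maintain
// (this checker is illustrative and not part of the original file):
func (wr *journalWriter) checkInvariantsSketch() error {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    // the indexed high-water mark never runs ahead of flushed journal bytes
    if wr.indexed > wr.off {
        return fmt.Errorf("indexed (%d) ahead of journal offset (%d)", wr.indexed, wr.off)
    }
    // buffered data never exceeds the fixed buffer capacity
    if len(wr.buf) > journalWriterBuffSize {
        return fmt.Errorf("buffer length (%d) exceeds capacity (%d)", len(wr.buf), journalWriterBuffSize)
    }
    return nil
}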
// bootstrapJournal reads in records from the journal file and the journal index file, initializing
// the state of the journalWriter. Root hashes read from root update records in the journal are written
// to |reflogRingBuffer|, which maintains the most recently updated roots used to generate the
// reflog. This function returns the most recent root hash for the journal as well as any error encountered.
// The journal index will be truncated to the last valid batch of lookups. Lookups with offsets
// larger than the position of the last valid lookup metadata are rewritten to the index as they
// are added to the novel ranges map. If the number of novel lookups exceeds |wr.maxNovel|, we
// extend the journal index with one metadata flush before exiting this function to save indexing
// progress.
func (wr *journalWriter) bootstrapJournal(ctx context.Context, reflogRingBuffer *reflogRingBuffer) (last hash.Hash, err error) {
    wr.lock.Lock()
    defer wr.lock.Unlock()

    if wr.maxNovel == 0 {
        wr.maxNovel = journalIndexDefaultMaxNovel
    }
    wr.ranges = newRangeIndex()

    p := filepath.Join(filepath.Dir(wr.path), journalIndexFileName)
    var ok bool
    ok, err = fileExists(p)
    if err != nil {
        return
    } else if ok {
        wr.index, err = os.OpenFile(p, os.O_RDWR, 0666)
    } else {
        wr.index, err = os.OpenFile(p, os.O_RDWR|os.O_CREATE, 0666)
    }
    if err != nil {
        return
    }
    wr.indexWriter = bufio.NewWriterSize(wr.index, journalIndexDefaultMaxNovel)

    if ok {
        var info os.FileInfo
        if info, err = wr.index.Stat(); err != nil {
            return hash.Hash{}, err
        }

        // initialize range index with enough capacity to
        // avoid rehashing during bootstrapping
        cnt := estimateRangeCount(info)
        wr.ranges.cached = swiss.NewMap[addr16, Range](cnt)

        eg, ectx := errgroup.WithContext(ctx)
        ch := make(chan []lookup, 4)

        // process the indexed portion of the journal
        var safeIndexOffset int64
        var prev int64

        eg.Go(func() (err error) {
            defer close(ch)
            safeIndexOffset, err = processIndexRecords(bufio.NewReader(wr.index), info.Size(), func(m lookupMeta, batch []lookup, batchChecksum uint32) error {
                if m.checkSum != batchChecksum {
                    return fmt.Errorf("invalid index checksum (%d != %d)", batchChecksum, m.checkSum)
                }

                if m.batchStart != prev {
                    return fmt.Errorf("index records do not cover contiguous region (%d != %d)", m.batchStart, prev)
                }
                prev = m.batchEnd

                // |m.batchEnd| is expected to point to a root hash record in |wr.journal|
                // containing a hash equal to |m.latestHash|, validate this here
                if h, err := peekRootHashAt(wr.journal, int64(m.batchEnd)); err != nil {
                    return err
                } else if h != m.latestHash {
                    return fmt.Errorf("invalid index record hash (%s != %s)", h.String(), m.latestHash.String())
                }

                select {
                case <-ectx.Done():
                    return ectx.Err()
                case ch <- batch:
                    // record a high-water mark for the indexed portion of the journal
                    wr.indexed = int64(m.batchEnd)
                }
                return nil
            })
            return err
        })
        // populate range hashmap
        eg.Go(func() error {
            for {
                select {
                case <-ectx.Done():
                    return nil
                case ll, ok := <-ch:
                    if !ok {
                        return nil
                    }
                    for _, l := range ll {
                        wr.ranges.putCached(l.a, l.r)
                    }
                }
            }
        })

        err = eg.Wait()
        if err != nil {
            err = fmt.Errorf("error bootstrapping chunk journal: %s", err.Error())
            if cerr := wr.corruptIndexRecovery(); cerr != nil {
                err = fmt.Errorf("error recovering corrupted chunk journal index: %s", cerr.Error())
            }
            return hash.Hash{}, err
        }

        // rewind index to last safe point. Note that |safeIndexOffset| refers
        // to a location in the index file, while |wr.indexed| refers to a position
        // in the journal file.
        if err := wr.truncateIndex(safeIndexOffset); err != nil {
            return hash.Hash{}, err
        }
        wr.ranges = wr.ranges.flatten()
    }

    var lastOffset int64

    // process the non-indexed portion of the journal starting at |wr.indexed|;
    // at minimum the non-indexed portion will include a root hash record.
    // Index lookups are added to the ongoing batch to re-synchronize.
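    // Two record kinds appear in this replay: chunk records, whose payload
    // Ranges are added to |wr.ranges| (and re-appended to the in-progress
    // index batch), and root hash records, which mark commit points. The
    // most recent root hash record encountered becomes |last|.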
    wr.off, err = processJournalRecords(ctx, wr.journal, wr.indexed, func(o int64, r journalRec) error {
        switch r.kind {
        case chunkJournalRecKind:
            rng := Range{
                Offset: uint64(o) + uint64(r.payloadOffset()),
                Length: uint32(len(r.payload)),
            }
            wr.ranges.put(r.address, rng)
            wr.uncmpSz += r.uncompressedPayloadSize()

            a := toAddr16(r.address)
            if err := writeIndexLookup(wr.indexWriter, lookup{a: a, r: rng}); err != nil {
                return err
            }
            wr.batchCrc = crc32.Update(wr.batchCrc, crcTable, a[:])

        case rootHashJournalRecKind:
            lastOffset = o
            last = hash.Hash(r.address)
            if !reflogDisabled && reflogRingBuffer != nil {
                reflogRingBuffer.Push(reflogRootHashEntry{
                    root:      r.address.String(),
                    timestamp: r.timestamp,
                })
            }

        default:
            return fmt.Errorf("unknown journal record kind (%d)", r.kind)
        }
        return nil
    })
    if err != nil {
        return hash.Hash{}, err
    }

    if wr.ranges.novelCount() > wr.maxNovel {
        // save bootstrap progress
        if err := wr.flushIndexRecord(last, lastOffset); err != nil {
            return hash.Hash{}, err
        }
    }

    wr.currentRoot = last

    return
}

// corruptIndexRecovery handles a corrupted or malformed journal index by truncating
// the index file and restarting the journal bootstrapping process without an index.
// todo: make backup file?
func (wr *journalWriter) corruptIndexRecovery() error {
    if err := wr.truncateIndex(0); err != nil {
        return err
    }
    // reset bootstrapping state
    wr.off, wr.indexed, wr.uncmpSz = 0, 0, 0
    wr.ranges = newRangeIndex()
    return nil
}

// truncateIndex rewinds and truncates the journal index file to |off| bytes.
func (wr *journalWriter) truncateIndex(off int64) error {
    if _, err := wr.index.Seek(off, io.SeekStart); err != nil {
        return err
    }
    if err := wr.index.Truncate(off); err != nil {
        return err
    }
    return nil
}

// hasAddr returns true if the journal contains a chunk with addr |h|.
func (wr *journalWriter) hasAddr(h hash.Hash) (ok bool) {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    _, ok = wr.ranges.get(h)
    return
}

// getCompressedChunk reads the CompressedChunk with addr |h|.
func (wr *journalWriter) getCompressedChunk(h hash.Hash) (CompressedChunk, error) {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    r, ok := wr.ranges.get(h)
    if !ok {
        return CompressedChunk{}, nil
    }
    buf := make([]byte, r.Length)
    if _, err := wr.readAt(buf, int64(r.Offset)); err != nil {
        return CompressedChunk{}, err
    }
    return NewCompressedChunk(hash.Hash(h), buf)
}

// getCompressedChunkAtRange reads the CompressedChunk with addr |h| at Range |r|.
func (wr *journalWriter) getCompressedChunkAtRange(r Range, h hash.Hash) (CompressedChunk, error) {
    buf := make([]byte, r.Length)
    if _, err := wr.readAt(buf, int64(r.Offset)); err != nil {
        return CompressedChunk{}, err
    }
    return NewCompressedChunk(hash.Hash(h), buf)
}

// getRange returns a Range for the chunk with addr |h|.
func (wr *journalWriter) getRange(h hash.Hash) (rng Range, ok bool, err error) {
    // callers will use |rng| to read directly from the
    // journal file, so we must flush here
    if err = wr.maybeFlush(); err != nil {
        return
    }
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    rng, ok = wr.ranges.get(h)
    return
}
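// A caller holding a Range from getRange can skip the second index lookup
// that getCompressedChunk would perform. An illustrative sketch of that
// two-step read path (the helper name is not part of this file's API):
func (wr *journalWriter) readChunkSketch(h hash.Hash) (CompressedChunk, error) {
    rng, ok, err := wr.getRange(h)
    if err != nil {
        return CompressedChunk{}, err
    } else if !ok {
        // mirror getCompressedChunk: absent chunks yield an empty value
        return CompressedChunk{}, nil
    }
    return wr.getCompressedChunkAtRange(rng, h)
}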
// writeCompressedChunk writes |cc| to the journal.
func (wr *journalWriter) writeCompressedChunk(cc CompressedChunk) error {
    wr.lock.Lock()
    defer wr.lock.Unlock()
    recordLen, payloadOff := chunkRecordSize(cc)
    rng := Range{
        Offset: uint64(wr.offset()) + uint64(payloadOff),
        Length: uint32(len(cc.FullCompressedChunk)),
    }
    buf, err := wr.getBytes(int(recordLen))
    if err != nil {
        return err
    }
    wr.unsyncd += uint64(recordLen)
    _ = writeChunkRecord(buf, cc)
    wr.ranges.put(cc.H, rng)

    a := toAddr16(cc.H)
    if err := writeIndexLookup(wr.indexWriter, lookup{a: a, r: rng}); err != nil {
        return err
    }
    wr.batchCrc = crc32.Update(wr.batchCrc, crcTable, a[:])

    // To fulfill our durability guarantees, we technically only need to
    // file.Sync() the journal when we commit a new root chunk. However,
    // allowing an unbounded amount of unflushed dirty pages to accumulate
    // in the OS's page cache makes it possible for small writes which come
    // along during a large non-committing write to block on flushing all
    // of the unflushed data. To minimize interference from large
    // non-committing writes, we cap the amount of unflushed data here.
    //
    // We go through |commitRootHash|, instead of directly |Sync()|ing the
    // file, because we also have accumulated delayed work in the form of
    // journal index records which may need to be serialized and flushed.
    // Assumptions in journal bootstrapping and the contents of the journal
    // index require us to have a newly written root hash record anytime we
    // write index records out. It's perfectly fine to reuse the current
    // root hash, and this will also take care of the |Sync|.
    if wr.unsyncd > journalMaybeSyncThreshold && !wr.currentRoot.IsEmpty() {
        return wr.commitRootHashUnlocked(wr.currentRoot)
    }

    return nil
}

// commitRootHash commits |root| to the journal and syncs the file to disk.
func (wr *journalWriter) commitRootHash(root hash.Hash) error {
    wr.lock.Lock()
    defer wr.lock.Unlock()
    return wr.commitRootHashUnlocked(root)
}

func (wr *journalWriter) commitRootHashUnlocked(root hash.Hash) error {
    buf, err := wr.getBytes(rootHashRecordSize())
    if err != nil {
        return err
    }
    wr.currentRoot = root
    n := writeRootHashRecord(buf, root)
    if err = wr.flush(); err != nil {
        return err
    }
    if err = wr.journal.Sync(); err != nil {
        return err
    }

    wr.unsyncd = 0
    if wr.ranges.novelCount() > wr.maxNovel {
        o := wr.offset() - int64(n) // pre-commit journal offset
        if err := wr.flushIndexRecord(root, o); err != nil {
            return err
        }
    }
    return nil
}

// flushIndexRecord writes metadata for a range of index lookups to the
// out-of-band journal index file. Index records accelerate journal
// bootstrapping by reducing the amount of the journal that must be processed.
func (wr *journalWriter) flushIndexRecord(root hash.Hash, end int64) (err error) {
    if err := writeJournalIndexMeta(wr.indexWriter, root, wr.indexed, end, wr.batchCrc); err != nil {
        return err
    }
    wr.batchCrc = 0
    wr.ranges = wr.ranges.flatten()
    // set a new high-water mark for the indexed portion of the journal
    wr.indexed = end
    return
}
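// Writes become durable only at root hash commits: chunk records may sit in
// |wr.buf| or in unsynced OS pages until commitRootHash (or the
// journalMaybeSyncThreshold safety valve above) forces a Sync. A hedged
// usage sketch of that write-then-commit flow:
func writeAndCommitSketch(wr *journalWriter, ccs []CompressedChunk, root hash.Hash) error {
    for _, cc := range ccs {
        // buffered; not yet guaranteed to be on disk
        if err := wr.writeCompressedChunk(cc); err != nil {
            return err
        }
    }
    // flushes the buffer, fsyncs the journal, and possibly
    // extends the journal index with a metadata record
    return wr.commitRootHash(root)
}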
// readAt reads len(p) bytes from the journal at offset |off|.
func (wr *journalWriter) readAt(p []byte, off int64) (n int, err error) {
    var bp []byte
    if off < wr.off {
        // fill some or all of |p| from |wr.journal|
        fread := int(wr.off - off)
        if len(p) > fread {
            // straddled read
            bp = p[fread:]
            p = p[:fread]
        }
        if n, err = wr.journal.ReadAt(p, off); err != nil {
            return 0, err
        }
        off = 0
    } else {
        // fill all of |p| from |wr.buf|
        bp = p
        off -= wr.off
    }
    n += copy(bp, wr.buf[off:])
    return
}

// getBytes returns a buffer for writers to copy data into.
func (wr *journalWriter) getBytes(n int) (buf []byte, err error) {
    c, l := cap(wr.buf), len(wr.buf)
    if n > c {
        err = fmt.Errorf("requested bytes (%d) exceed capacity (%d)", n, c)
        return
    } else if n > c-l {
        if err = wr.flush(); err != nil {
            return
        }
    }
    l = len(wr.buf)
    wr.buf = wr.buf[:l+n]
    buf = wr.buf[l : l+n]
    return
}

// flush writes buffered data into the journal file.
func (wr *journalWriter) flush() (err error) {
    if _, err = wr.journal.WriteAt(wr.buf, wr.off); err != nil {
        return err
    }
    wr.off += int64(len(wr.buf))
    wr.buf = wr.buf[:0]
    return
}

// maybeFlush flushes buffered data, if any exists.
func (wr *journalWriter) maybeFlush() (err error) {
    wr.lock.RLock()
    empty := len(wr.buf) == 0
    wr.lock.RUnlock()
    if empty {
        return
    }
    wr.lock.Lock()
    defer wr.lock.Unlock()
    return wr.flush()
}

type journalWriterSnapshot struct {
    io.Reader
    closer func() error
}

func (s journalWriterSnapshot) Close() error {
    return s.closer()
}
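// snapshot (below) gives readers a stable prefix of the journal while writes
// continue on the original descriptor. An illustrative sketch that copies
// the current journal contents to |dst|:
func copyJournalSketch(wr *journalWriter, dst io.Writer) (int64, error) {
    rd, sz, err := wr.snapshot()
    if err != nil {
        return 0, err
    }
    defer rd.Close()
    n, err := io.Copy(dst, rd)
    if err == nil && n != sz {
        err = fmt.Errorf("short journal copy (%d of %d bytes)", n, sz)
    }
    return n, err
}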
// snapshot returns an io.Reader with a consistent view of
// the current state of the journal file.
func (wr *journalWriter) snapshot() (io.ReadCloser, int64, error) {
    wr.lock.Lock()
    defer wr.lock.Unlock()
    if err := wr.flush(); err != nil {
        return nil, 0, err
    }
    // open a new file descriptor with an
    // independent lifecycle from |wr.journal|
    f, err := os.Open(wr.path)
    if err != nil {
        return nil, 0, err
    }
    return journalWriterSnapshot{
        io.LimitReader(f, wr.off),
        func() error {
            return f.Close()
        },
    }, wr.off, nil
}

// offset returns the logical end of the journal: the flushed
// file length plus any data still sitting in the write buffer.
func (wr *journalWriter) offset() int64 {
    return wr.off + int64(len(wr.buf))
}

func (wr *journalWriter) currentSize() int64 {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    return wr.offset()
}

func (wr *journalWriter) uncompressedSize() uint64 {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    return wr.uncmpSz
}

func (wr *journalWriter) recordCount() uint32 {
    wr.lock.RLock()
    defer wr.lock.RUnlock()
    return wr.ranges.count()
}

func (wr *journalWriter) Close() (err error) {
    wr.lock.Lock()
    defer wr.lock.Unlock()

    if wr.journal == nil {
        logrus.Warnf("journal writer has already been closed (%s)", wr.path)
        return nil
    }

    if err = wr.flush(); err != nil {
        return err
    }
    if wr.index != nil {
        _ = wr.indexWriter.Flush()
        _ = wr.index.Close()
    }
    if cerr := wr.journal.Sync(); cerr != nil {
        err = cerr
    }
    if cerr := wr.journal.Close(); cerr != nil {
        err = cerr
    } else {
        // nil out the journal after the file has been closed,
        // so that it's obvious it's been closed
        wr.journal = nil
    }

    return err
}
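// The rangeIndex below consults the full-width |novel| map before falling
// back to the 16-byte-prefix |cached| map, so an exact match always wins.
// A sketch restating that precedence (equivalent to rangeIndex.get, shown
// here only for illustration):
func lookupPrecedenceSketch(idx rangeIndex, h hash.Hash) (Range, bool) {
    if rng, ok := idx.novel.Get(h); ok {
        return rng, true // full-width match on recently written chunks
    }
    return idx.cached.Get(toAddr16(h)) // prefix match on indexed chunks
}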
// A rangeIndex maps chunk addresses to read Ranges in the chunk journal file.
type rangeIndex struct {
    // novel Ranges represent the most recent chunks written to
    // the journal. These Ranges have not yet been written to
    // a journal index record.
    novel *swiss.Map[hash.Hash, Range]

    // cached Ranges are bootstrapped from an out-of-band journal
    // index file. To save memory, these Ranges are keyed by a 16-byte
    // prefix of their addr, which is assumed to be globally unique.
    cached *swiss.Map[addr16, Range]
}

type addr16 [16]byte

func toAddr16(full hash.Hash) (prefix addr16) {
    copy(prefix[:], full[:])
    return
}

func newRangeIndex() rangeIndex {
    return rangeIndex{
        novel:  swiss.NewMap[hash.Hash, Range](journalIndexDefaultMaxNovel),
        cached: swiss.NewMap[addr16, Range](0),
    }
}

// estimateRangeCount estimates the number of lookups in the index file
// from its size (each serialized lookup occupies roughly 32 bytes).
func estimateRangeCount(info os.FileInfo) uint32 {
    return uint32(info.Size()/32) + journalIndexDefaultMaxNovel
}

func (idx rangeIndex) get(h hash.Hash) (rng Range, ok bool) {
    rng, ok = idx.novel.Get(h)
    if !ok {
        rng, ok = idx.cached.Get(toAddr16(h))
    }
    return
}

func (idx rangeIndex) put(h hash.Hash, rng Range) {
    idx.novel.Put(h, rng)
}

func (idx rangeIndex) putCached(a addr16, rng Range) {
    idx.cached.Put(a, rng)
}

func (idx rangeIndex) count() uint32 {
    return uint32(idx.novel.Count() + idx.cached.Count())
}

func (idx rangeIndex) novelCount() int {
    return idx.novel.Count()
}

func (idx rangeIndex) novelLookups() (lookups []lookup) {
    lookups = make([]lookup, 0, idx.novel.Count())
    idx.novel.Iter(func(a hash.Hash, r Range) (stop bool) {
        lookups = append(lookups, lookup{a: toAddr16(a), r: r})
        return
    })
    return
}

// flatten migrates all novel Ranges into the cached map, typically after
// their lookups have been persisted to the journal index file.
func (idx rangeIndex) flatten() rangeIndex {
    idx.novel.Iter(func(a hash.Hash, r Range) (stop bool) {
        idx.cached.Put(toAddr16(a), r)
        return
    })
    idx.novel = swiss.NewMap[hash.Hash, Range](journalIndexDefaultMaxNovel)
    return idx
}
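// An end-to-end sketch of the rangeIndex lifecycle as used above: novel puts
// accumulate until an index flush, after which flatten() migrates them to the
// prefix-keyed cached map (the hash literal and Range values are illustrative):
func rangeIndexLifecycleSketch() {
    idx := newRangeIndex()
    h := hash.Parse("0123456789abcdefghijklmnopqrstuv")
    idx.put(h, Range{Offset: 1024, Length: 64})
    _ = idx.novelLookups() // lookups that would be serialized to the index file
    idx = idx.flatten()    // after a metadata flush, novel ranges move to cached
    if _, ok := idx.get(h); !ok {
        panic("flattened ranges must remain addressable")
    }
}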