github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/journal.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nbs 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 "os" 23 "path/filepath" 24 "sort" 25 "strconv" 26 "time" 27 28 "github.com/dolthub/fslock" 29 "github.com/sirupsen/logrus" 30 31 "github.com/dolthub/dolt/go/libraries/doltcore/dconfig" 32 "github.com/dolthub/dolt/go/store/chunks" 33 "github.com/dolthub/dolt/go/store/hash" 34 ) 35 36 const ( 37 chunkJournalName = chunkJournalAddr // todo 38 ) 39 40 // reflogDisabled indicates whether access to the reflog has been disabled and if so, no chunk journal root references 41 // should be kept in memory. This is controlled by the DOLT_DISABLE_REFLOG env var and this var is ONLY written to 42 // during initialization. All access after initialization is read-only, so no additional locking is needed. 43 var reflogDisabled = false 44 45 // defaultReflogBufferSize controls how many of the most recent root references for root updates are kept in-memory. 46 // This default can be overridden by setting the DOLT_REFLOG_RECORD_LIMIT before Dolt starts. 47 const defaultReflogBufferSize = 5_000 48 49 func init() { 50 if os.Getenv(dconfig.EnvDisableReflog) != "" { 51 reflogDisabled = true 52 } 53 } 54 55 // ChunkJournal is a persistence abstraction for a NomsBlockStore. 56 // It implements both manifest and tablePersister, durably writing 57 // both memTable persists and manifest updates to a single file. 58 type ChunkJournal struct { 59 wr *journalWriter 60 path string 61 62 contents manifestContents 63 backing *journalManifest 64 persister *fsTablePersister 65 66 // reflogRingBuffer holds the most recent roots written to the chunk journal so that they can be 67 // quickly loaded for reflog queries without having to re-read the journal file from disk. 68 reflogRingBuffer *reflogRingBuffer 69 } 70 71 var _ tablePersister = &ChunkJournal{} 72 var _ tableFilePersister = &ChunkJournal{} 73 var _ manifest = &ChunkJournal{} 74 var _ manifestGCGenUpdater = &ChunkJournal{} 75 var _ io.Closer = &ChunkJournal{} 76 77 func newChunkJournal(ctx context.Context, nbfVers, dir string, m *journalManifest, p *fsTablePersister) (*ChunkJournal, error) { 78 path, err := filepath.Abs(filepath.Join(dir, chunkJournalName)) 79 if err != nil { 80 return nil, err 81 } 82 83 j := &ChunkJournal{path: path, backing: m, persister: p} 84 j.contents.nbfVers = nbfVers 85 j.reflogRingBuffer = newReflogRingBuffer(reflogBufferSize()) 86 87 ok, err := fileExists(path) 88 if err != nil { 89 return nil, err 90 } else if ok { 91 // only bootstrap journalWriter if the journal file exists, 92 // otherwise we wait to open in case we're cloning 93 if err = j.bootstrapJournalWriter(ctx); err != nil { 94 return nil, err 95 } 96 } 97 return j, nil 98 } 99 100 // reflogBufferSize returns the size of the ring buffer to allocate to store in-memory roots references when 101 // new roots are written to a chunk journal. If reflog queries have been disabled, this function will return 0. 102 // If the default buffer size has been overridden via DOLT_REFLOG_RECORD_LIMIT, that value will be returned if 103 // it can be successfully parsed. Otherwise, the default buffer size will be returned. 104 func reflogBufferSize() int { 105 if reflogDisabled { 106 return 0 107 } 108 109 reflogBufferSize := defaultReflogBufferSize 110 if limit := os.Getenv(dconfig.EnvReflogRecordLimit); limit != "" { 111 i, err := strconv.Atoi(limit) 112 if err != nil { 113 logrus.Warnf("unable to parse integer value for %s from %s: %s", 114 dconfig.EnvReflogRecordLimit, limit, err.Error()) 115 } else { 116 if i <= 0 { 117 reflogDisabled = true 118 } else { 119 reflogBufferSize = i 120 } 121 } 122 } 123 124 return reflogBufferSize 125 } 126 127 // bootstrapJournalWriter initializes the journalWriter, which manages access to the 128 // journal file for this ChunkJournal. The bootstrapping process differs depending 129 // on whether a journal file exists at startup time. 130 // 131 // If a journal file does not exist, we create one and commit a root hash record 132 // containing the root hash we read from the manifest file. 133 // 134 // If a journal file does exist, we process its records to build up an index of its 135 // resident chunks. Processing journal records is potentially accelerated by an index 136 // file (see indexRec). The journal file is the source of truth for latest root hash. 137 // As we process journal records, we keep track of the latest root hash record we see 138 // and update the manifest file with the last root hash we saw. 139 func (j *ChunkJournal) bootstrapJournalWriter(ctx context.Context) (err error) { 140 var ok bool 141 ok, err = fileExists(j.path) 142 if err != nil { 143 return err 144 } 145 146 if !ok { // create new journal file 147 j.wr, err = createJournalWriter(ctx, j.path) 148 if err != nil { 149 return err 150 } 151 152 _, err = j.wr.bootstrapJournal(ctx, j.reflogRingBuffer) 153 if err != nil { 154 return err 155 } 156 157 var contents manifestContents 158 ok, contents, err = j.backing.ParseIfExists(ctx, &Stats{}, nil) 159 if err != nil { 160 return err 161 } 162 if ok { 163 // write the current root hash to the journal file 164 if err = j.wr.commitRootHash(contents.root); err != nil { 165 return 166 } 167 j.contents = contents 168 } 169 return 170 } 171 172 j.wr, ok, err = openJournalWriter(ctx, j.path) 173 if err != nil { 174 return err 175 } else if !ok { 176 return errors.New("missing chunk journal " + j.path) 177 } 178 179 // parse existing journal file 180 root, err := j.wr.bootstrapJournal(ctx, j.reflogRingBuffer) 181 if err != nil { 182 return err 183 } 184 185 mc, err := trueUpBackingManifest(ctx, root, j.backing) 186 if err != nil { 187 return err 188 } 189 j.contents = mc 190 return 191 } 192 193 // the journal file is the source of truth for the root hash, true-up persisted manifest 194 func trueUpBackingManifest(ctx context.Context, root hash.Hash, backing *journalManifest) (manifestContents, error) { 195 ok, mc, err := backing.ParseIfExists(ctx, &Stats{}, nil) 196 if err != nil { 197 return manifestContents{}, err 198 } else if !ok { 199 return manifestContents{}, fmt.Errorf("manifest not found when opening chunk journal") 200 } 201 202 // set our in-memory root to match the journal 203 mc.root = root 204 if backing.readOnly() { 205 return mc, nil 206 } 207 208 prev := mc.lock 209 next := generateLockHash(mc.root, mc.specs, mc.appendix) 210 mc.lock = next 211 212 mc, err = backing.Update(ctx, prev, mc, &Stats{}, nil) 213 if err != nil { 214 return manifestContents{}, err 215 } else if mc.lock != next { 216 return manifestContents{}, errOptimisticLockFailedTables 217 } else if mc.root != root { 218 return manifestContents{}, errOptimisticLockFailedRoot 219 } 220 // true-up succeeded 221 return mc, nil 222 } 223 224 // IterateRoots iterates over the in-memory roots tracked by the ChunkJournal, from oldest root to newest root, 225 // and passes the root and associated timestamp to a callback function, |f|. If |f| returns an error, iteration 226 // is stopped and the error is returned. 227 func (j *ChunkJournal) IterateRoots(f func(root string, timestamp *time.Time) error) error { 228 return j.reflogRingBuffer.Iterate(func(entry reflogRootHashEntry) error { 229 // If we're reading a chunk journal written with an older version of Dolt, the root hash journal record may 230 // not have a timestamp value, so we'll have a time.Time instance in its zero value. If we see this, pass 231 // nil instead to signal to callers that there is no valid timestamp available. 232 var pTimestamp *time.Time = nil 233 if time.Time.IsZero(entry.timestamp) == false { 234 pTimestamp = &entry.timestamp 235 } 236 237 return f(entry.root, pTimestamp) 238 }) 239 } 240 241 // Persist implements tablePersister. 242 func (j *ChunkJournal) Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) { 243 if j.backing.readOnly() { 244 return nil, errReadOnlyManifest 245 } else if err := j.maybeInit(ctx); err != nil { 246 return nil, err 247 } 248 249 if haver != nil { 250 sort.Sort(hasRecordByPrefix(mt.order)) // hasMany() requires addresses to be sorted. 251 if _, err := haver.hasMany(mt.order); err != nil { 252 return nil, err 253 } 254 sort.Sort(hasRecordByOrder(mt.order)) // restore "insertion" order for write 255 } 256 257 for _, record := range mt.order { 258 if record.has { 259 continue 260 } 261 c := chunks.NewChunkWithHash(hash.Hash(*record.a), mt.chunks[*record.a]) 262 err := j.wr.writeCompressedChunk(ChunkToCompressedChunk(c)) 263 if err != nil { 264 return nil, err 265 } 266 } 267 return journalChunkSource{journal: j.wr}, nil 268 } 269 270 // ConjoinAll implements tablePersister. 271 func (j *ChunkJournal) ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, cleanupFunc, error) { 272 if j.backing.readOnly() { 273 return nil, nil, errReadOnlyManifest 274 } 275 return j.persister.ConjoinAll(ctx, sources, stats) 276 } 277 278 // Open implements tablePersister. 279 func (j *ChunkJournal) Open(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (chunkSource, error) { 280 if name == journalAddr { 281 if err := j.maybeInit(ctx); err != nil { 282 return nil, err 283 } 284 return journalChunkSource{journal: j.wr}, nil 285 } 286 return j.persister.Open(ctx, name, chunkCount, stats) 287 } 288 289 // Exists implements tablePersister. 290 func (j *ChunkJournal) Exists(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (bool, error) { 291 return j.persister.Exists(ctx, name, chunkCount, stats) 292 } 293 294 // PruneTableFiles implements tablePersister. 295 func (j *ChunkJournal) PruneTableFiles(ctx context.Context, keeper func() []hash.Hash, mtime time.Time) error { 296 if j.backing.readOnly() { 297 return errReadOnlyManifest 298 } 299 // sanity check that we're not deleting the journal 300 var keepJournal bool 301 for _, a := range keeper() { 302 if a == journalAddr { 303 keepJournal = true 304 } 305 } 306 if j.wr != nil && !keepJournal { 307 return errors.New("cannot drop chunk journal through tablePersister.PruneTableFiles()") 308 } 309 return j.persister.PruneTableFiles(ctx, keeper, mtime) 310 } 311 312 func (j *ChunkJournal) Path() string { 313 return filepath.Dir(j.path) 314 } 315 316 func (j *ChunkJournal) CopyTableFile(ctx context.Context, r io.Reader, fileId string, fileSz uint64, chunkCount uint32) error { 317 if j.backing.readOnly() { 318 return errReadOnlyManifest 319 } 320 return j.persister.CopyTableFile(ctx, r, fileId, fileSz, chunkCount) 321 } 322 323 // Name implements manifest. 324 func (j *ChunkJournal) Name() string { 325 return j.path 326 } 327 328 // Update implements manifest. 329 func (j *ChunkJournal) Update(ctx context.Context, lastLock hash.Hash, next manifestContents, stats *Stats, writeHook func() error) (manifestContents, error) { 330 if j.backing.readOnly() { 331 return j.contents, errReadOnlyManifest 332 } 333 334 if j.wr == nil { 335 // pass the update to |j.backing| if the journal is not initialized 336 return j.backing.Update(ctx, lastLock, next, stats, writeHook) 337 } 338 339 if j.contents.gcGen != next.gcGen { 340 return manifestContents{}, errors.New("use UpdateGCGen to update GC generation") 341 } else if j.contents.lock != lastLock { 342 return j.contents, nil // |next| is stale 343 } 344 345 if writeHook != nil { 346 if err := writeHook(); err != nil { 347 return manifestContents{}, err 348 } 349 } 350 351 // if |next| has a different table file set, flush to |j.backing| 352 if !equalSpecs(j.contents.specs, next.specs) { 353 if err := j.flushToBackingManifest(ctx, next, stats); err != nil { 354 return manifestContents{}, err 355 } 356 } 357 358 if err := j.wr.commitRootHash(next.root); err != nil { 359 return manifestContents{}, err 360 } 361 j.contents = next 362 363 // Update the in-memory structures so that the ChunkJournal can be queried for reflog data 364 if !reflogDisabled { 365 j.reflogRingBuffer.Push(reflogRootHashEntry{ 366 root: next.root.String(), 367 timestamp: time.Now(), 368 }) 369 } 370 371 return j.contents, nil 372 } 373 374 // UpdateGCGen implements manifestGCGenUpdater. 375 func (j *ChunkJournal) UpdateGCGen(ctx context.Context, lastLock hash.Hash, next manifestContents, stats *Stats, writeHook func() error) (manifestContents, error) { 376 if j.backing.readOnly() { 377 return j.contents, errReadOnlyManifest 378 } else if j.wr == nil { 379 // pass the update to |j.backing| if the journal is not initialized 380 return j.backing.UpdateGCGen(ctx, lastLock, next, stats, writeHook) 381 } else if j.contents.lock != lastLock { 382 return j.contents, nil // |next| is stale 383 } 384 385 // UpdateGCGen below cannot update the root hash, only the GC generation 386 // flush |j.contents| with the latest root hash here 387 if err := j.flushToBackingManifest(ctx, j.contents, stats); err != nil { 388 return manifestContents{}, err 389 } 390 391 latest, err := j.backing.UpdateGCGen(ctx, j.contents.lock, next, stats, writeHook) 392 if err != nil { 393 return manifestContents{}, err 394 } else if latest.root == next.root { 395 j.contents = next // success 396 } 397 398 // if we're landing a new manifest without the chunk journal 399 // then physically delete the journal here and cleanup |j.wr| 400 if !containsJournalSpec(latest.specs) { 401 if err = j.dropJournalWriter(ctx); err != nil { 402 return manifestContents{}, err 403 } 404 } 405 406 // Truncate the in-memory root and root timestamp metadata 407 if !reflogDisabled { 408 j.reflogRingBuffer.Truncate() 409 } 410 411 return latest, nil 412 } 413 414 // flushToBackingManifest attempts to update the backing file manifest with |next|. This is necessary 415 // when making manifest updates other than root hash updates (adding new table files, updating GC gen, etc). 416 func (j *ChunkJournal) flushToBackingManifest(ctx context.Context, next manifestContents, stats *Stats) error { 417 _, prev, err := j.backing.ParseIfExists(ctx, stats, nil) 418 if err != nil { 419 return err 420 } 421 var mc manifestContents 422 mc, err = j.backing.Update(ctx, prev.lock, next, stats, nil) 423 if err != nil { 424 return err 425 } else if mc.lock != next.lock { 426 return errOptimisticLockFailedTables 427 } 428 return nil 429 } 430 431 func (j *ChunkJournal) dropJournalWriter(ctx context.Context) error { 432 curr := j.wr 433 if j.wr == nil { 434 return nil 435 } 436 j.wr = nil 437 if err := curr.Close(); err != nil { 438 return err 439 } 440 return deleteJournalAndIndexFiles(ctx, curr.path) 441 } 442 443 // ParseIfExists implements manifest. 444 func (j *ChunkJournal) ParseIfExists(ctx context.Context, stats *Stats, readHook func() error) (ok bool, mc manifestContents, err error) { 445 if j.wr == nil { 446 // parse contents from |j.backing| if the journal is not initialized 447 return j.backing.ParseIfExists(ctx, stats, readHook) 448 } 449 if readHook != nil { 450 if err = readHook(); err != nil { 451 return false, manifestContents{}, err 452 } 453 } 454 ok, mc = true, j.contents 455 return 456 } 457 458 func (j *ChunkJournal) maybeInit(ctx context.Context) (err error) { 459 if j.wr == nil { 460 err = j.bootstrapJournalWriter(ctx) 461 } 462 return 463 } 464 465 // Close implements io.Closer 466 func (j *ChunkJournal) Close() (err error) { 467 if j.wr != nil { 468 err = j.wr.Close() 469 // flush the latest root to the backing manifest 470 if !j.backing.readOnly() { 471 cerr := j.flushToBackingManifest(context.Background(), j.contents, &Stats{}) 472 if err == nil { 473 err = cerr 474 } 475 } 476 } 477 // close the journal manifest to release the file lock 478 if cerr := j.backing.Close(); err == nil { 479 err = cerr // keep first error 480 } 481 482 return err 483 } 484 485 func (j *ChunkJournal) AccessMode() chunks.ExclusiveAccessMode { 486 if j.backing.readOnly() { 487 return chunks.ExclusiveAccessMode_ReadOnly 488 } 489 return chunks.ExclusiveAccessMode_Exclusive 490 } 491 492 type journalConjoiner struct { 493 child conjoinStrategy 494 } 495 496 func (c journalConjoiner) conjoinRequired(ts tableSet) bool { 497 return c.child.conjoinRequired(ts) 498 } 499 500 func (c journalConjoiner) chooseConjoinees(upstream []tableSpec) (conjoinees, keepers []tableSpec, err error) { 501 var stash tableSpec // don't conjoin journal 502 pruned := make([]tableSpec, 0, len(upstream)) 503 for _, ts := range upstream { 504 if isJournalAddr(ts.name) { 505 stash = ts 506 } else { 507 pruned = append(pruned, ts) 508 } 509 } 510 conjoinees, keepers, err = c.child.chooseConjoinees(pruned) 511 if err != nil { 512 return nil, nil, err 513 } 514 if !hash.Hash(stash.name).IsEmpty() { 515 keepers = append(keepers, stash) 516 } 517 return 518 } 519 520 // newJournalManifest makes a new file manifest. 521 func newJournalManifest(ctx context.Context, dir string) (m *journalManifest, err error) { 522 lock := fslock.New(filepath.Join(dir, lockFileName)) 523 // try to take the file lock. if we fail, make the manifest read-only. 524 // if we succeed, hold the file lock until we close the journalManifest 525 err = lock.LockWithTimeout(lockFileTimeout) 526 if errors.Is(err, fslock.ErrTimeout) { 527 lock, err = nil, nil // read only 528 } else if err != nil { 529 return nil, err 530 } 531 m = &journalManifest{dir: dir, lock: lock} 532 533 var f *os.File 534 f, err = openIfExists(filepath.Join(dir, manifestFileName)) 535 if err != nil { 536 if lock != nil { 537 _ = lock.Unlock() 538 } 539 return nil, err 540 } else if f == nil { 541 return m, nil 542 } 543 defer func() { 544 if cerr := f.Close(); err == nil { 545 err = cerr // keep first error 546 } 547 if err != nil { 548 if lock != nil { 549 _ = lock.Unlock() 550 } 551 } 552 }() 553 554 var ok bool 555 ok, _, err = m.ParseIfExists(ctx, &Stats{}, nil) 556 if err != nil { 557 if lock != nil { 558 _ = lock.Unlock() 559 } 560 return nil, err 561 } else if !ok { 562 if lock != nil { 563 _ = lock.Unlock() 564 } 565 return nil, ErrUnreadableManifest 566 } 567 return 568 } 569 570 type journalManifest struct { 571 dir string 572 lock *fslock.Lock 573 } 574 575 func (jm *journalManifest) readOnly() bool { 576 return jm.lock == nil 577 } 578 579 // Name implements manifest. 580 func (jm *journalManifest) Name() string { 581 return jm.dir 582 } 583 584 // ParseIfExists implements manifest. 585 func (jm *journalManifest) ParseIfExists(ctx context.Context, stats *Stats, readHook func() error) (exists bool, contents manifestContents, err error) { 586 t1 := time.Now() 587 defer func() { stats.ReadManifestLatency.SampleTimeSince(t1) }() 588 return parseIfExists(ctx, jm.dir, readHook) 589 } 590 591 // Update implements manifest. 592 func (jm *journalManifest) Update(ctx context.Context, lastLock hash.Hash, newContents manifestContents, stats *Stats, writeHook func() error) (mc manifestContents, err error) { 593 if jm.readOnly() { 594 _, mc, err = jm.ParseIfExists(ctx, stats, nil) 595 if err != nil { 596 return manifestContents{}, err 597 } 598 // return current contents and sentinel error 599 return mc, errReadOnlyManifest 600 } 601 602 t1 := time.Now() 603 defer func() { stats.WriteManifestLatency.SampleTimeSince(t1) }() 604 checker := func(upstream, contents manifestContents) error { 605 if contents.gcGen != upstream.gcGen { 606 return chunks.ErrGCGenerationExpired 607 } 608 return nil 609 } 610 return updateWithChecker(ctx, jm.dir, syncFlush, checker, lastLock, newContents, writeHook) 611 } 612 613 // UpdateGCGen implements manifest. 614 func (jm *journalManifest) UpdateGCGen(ctx context.Context, lastLock hash.Hash, newContents manifestContents, stats *Stats, writeHook func() error) (mc manifestContents, err error) { 615 if jm.readOnly() { 616 _, mc, err = jm.ParseIfExists(ctx, stats, nil) 617 if err != nil { 618 return manifestContents{}, err 619 } 620 // return current contents and sentinel error 621 return mc, errReadOnlyManifest 622 } 623 624 t1 := time.Now() 625 defer func() { stats.WriteManifestLatency.SampleTimeSince(t1) }() 626 checker := func(upstream, contents manifestContents) error { 627 if contents.gcGen == upstream.gcGen { 628 return errors.New("UpdateGCGen() must update the garbage collection generation") 629 } else if contents.root != upstream.root { 630 return errors.New("UpdateGCGen() cannot update the root") 631 } 632 return nil 633 } 634 return updateWithChecker(ctx, jm.dir, syncFlush, checker, lastLock, newContents, writeHook) 635 } 636 637 func (jm *journalManifest) Close() (err error) { 638 if jm.lock != nil { 639 err = jm.lock.Unlock() 640 jm.lock = nil 641 } 642 return 643 } 644 645 func containsJournalSpec(specs []tableSpec) (ok bool) { 646 for _, spec := range specs { 647 if spec.name == journalAddr { 648 ok = true 649 break 650 } 651 } 652 return 653 }