github.com/onflow/flow-go@v0.33.17/ledger/complete/wal/checkpointer.go

package wal

import (
	"bufio"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/docker/go-units"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	"golang.org/x/sync/errgroup"

	"github.com/onflow/flow-go/ledger"
	"github.com/onflow/flow-go/ledger/complete/mtrie"
	"github.com/onflow/flow-go/ledger/complete/mtrie/flattener"
	"github.com/onflow/flow-go/ledger/complete/mtrie/node"
	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
	"github.com/onflow/flow-go/model/bootstrap"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/module/util"
	utilsio "github.com/onflow/flow-go/utils/io"
)

const checkpointFilenamePrefix = "checkpoint."

const MagicBytesCheckpointHeader uint16 = 0x2137
const MagicBytesCheckpointSubtrie uint16 = 0x2136
const MagicBytesCheckpointToptrie uint16 = 0x2135

const VersionV1 uint16 = 0x01

// Versions were reset while changing the trie format, so the version was bumped to 3 to avoid conflicts.
// Version 3 contains a file checksum for detecting corrupted checkpoint files.
const VersionV3 uint16 = 0x03

// Version 4 contains a footer with node count and trie count (previously in the header).
// Version 4 also reduces checkpoint data size. See EncodeNode() and EncodeTrie() for more details.
const VersionV4 uint16 = 0x04

// Version 5 includes these changes:
//   - remove regCount and maxDepth from serialized nodes
//   - add allocated register count and size to serialized tries
//   - reduce number of bytes used to encode payload value size from 8 bytes to 4 bytes.
//
// See EncodeNode() and EncodeTrie() for more details.
const VersionV5 uint16 = 0x05

// Version 6 includes these changes:
//   - trie nodes are stored in 17 additional checkpoint files, with .0, .1, .2, ... .16 as
//     file name extensions
const VersionV6 uint16 = 0x06

// MaxVersion is the latest checkpoint version we support.
// MaxVersion needs to be updated when creating a newer version.
const MaxVersion = VersionV6

const (
	encMagicSize        = 2
	encVersionSize      = 2
	headerSize          = encMagicSize + encVersionSize
	encSubtrieCountSize = 2
	encNodeCountSize    = 8
	encTrieCountSize    = 2
	crc32SumSize        = 4
)

// defaultBufioReadSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioReadSize can be increased to 8KiB, 16KiB, 32KiB, etc. if it
// improves performance on typical EN hardware.
const defaultBufioReadSize = 1024 * 32

// defaultBufioWriteSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioWriteSize can be increased to 8KiB, 16KiB, 32KiB, etc. if it
// improves performance on typical EN hardware.
const defaultBufioWriteSize = 1024 * 32

type Checkpointer struct {
	dir            string
	wal            *DiskWAL
	keyByteSize    int
	forestCapacity int
}

func NewCheckpointer(wal *DiskWAL, keyByteSize int, forestCapacity int) *Checkpointer {
	return &Checkpointer{
		dir:            wal.wal.Dir(),
		wal:            wal,
		keyByteSize:    keyByteSize,
		forestCapacity: forestCapacity,
	}
}
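// decodeHeaderExample is an illustrative sketch (not part of the original
// file) showing how the 4-byte header written by StoreCheckpointV5 and
// validated in readCheckpoint below is laid out: a magic number (2 bytes,
// big endian) followed by a version (2 bytes, big endian).
func decodeHeaderExample(header []byte) (magic uint16, version uint16, err error) {
	if len(header) < headerSize {
		return 0, 0, fmt.Errorf("header too short: %d < %d", len(header), headerSize)
	}
	magic = binary.BigEndian.Uint16(header)                  // e.g. 0x2137 (MagicBytesCheckpointHeader)
	version = binary.BigEndian.Uint16(header[encMagicSize:]) // e.g. 0x05 (VersionV5)
	return magic, version, nil
}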
// listCheckpoints returns all the numbers (unsorted) of the checkpoint files, and the number of the last checkpoint.
func (c *Checkpointer) listCheckpoints() ([]int, int, error) {
	return ListCheckpoints(c.dir)
}

// ListCheckpoints returns all the numbers of the checkpoint files, and the number of the last checkpoint.
// Note: it doesn't include the root checkpoint file.
func ListCheckpoints(dir string) ([]int, int, error) {
	list := make([]int, 0)

	files, err := os.ReadDir(dir)
	if err != nil {
		return nil, -1, fmt.Errorf("cannot list directory [%s] content: %w", dir, err)
	}
	last := -1
	for _, fn := range files {
		fname := fn.Name()
		if !strings.HasPrefix(fname, checkpointFilenamePrefix) {
			continue
		}
		justNumber := fname[len(checkpointFilenamePrefix):]
		k, err := strconv.Atoi(justNumber)
		if err != nil {
			continue
		}

		list = append(list, k)

		// the last checkpoint is the one with the highest number
		if k > last {
			last = k
		}
	}

	return list, last, nil
}

// Checkpoints returns all the numbers of the checkpoint files in ascending order.
// Note: it doesn't include the root checkpoint file.
func (c *Checkpointer) Checkpoints() ([]int, error) {
	return Checkpoints(c.dir)
}

// Checkpoints returns all the checkpoint numbers in ascending order.
func Checkpoints(dir string) ([]int, error) {
	list, _, err := ListCheckpoints(dir)
	if err != nil {
		return nil, fmt.Errorf("could not fetch all checkpoints: %w", err)
	}

	sort.Ints(list)

	return list, nil
}

// LatestCheckpoint returns the number of the latest checkpoint, or -1 if there are no checkpoints.
func (c *Checkpointer) LatestCheckpoint() (int, error) {
	_, last, err := c.listCheckpoints()
	return last, err
}

// NotCheckpointedSegments returns the numbers of segments which are not checkpointed yet,
// or -1, -1 if there are no such segments.
func (c *Checkpointer) NotCheckpointedSegments() (from, to int, err error) {

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get last checkpoint: %w", err)
	}

	first, last, err := c.wal.Segments()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get range of segments: %w", err)
	}

	// there are no segments at all, there is nothing to checkpoint
	if first == -1 && last == -1 {
		return -1, -1, nil
	}

	// no checkpoints
	if latestCheckpoint == -1 {
		return first, last, nil
	}

	// segments before checkpoint
	if last <= latestCheckpoint {
		return -1, -1, nil
	}

	// there is a gap between the last checkpoint and the segments
	if last > latestCheckpoint && latestCheckpoint < first-1 {
		return -1, -1, fmt.Errorf("gap between last checkpoint and segments")
	}

	return latestCheckpoint + 1, last, nil
}
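// Example (hypothetical usage sketch, assuming a constructed Checkpointer c):
//
//	from, to, err := c.NotCheckpointedSegments()
//	if err != nil {
//		// handle error
//	}
//	if from == -1 && to == -1 {
//		// nothing to do: either no segments exist, or the latest
//		// checkpoint already covers every segment
//	} else {
//		err = c.Checkpoint(to) // checkpoint everything up to segment `to`
//	}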
// Checkpoint creates a new checkpoint stopping at the given segment.
func (c *Checkpointer) Checkpoint(to int) (err error) {

	_, notCheckpointedTo, err := c.NotCheckpointedSegments()
	if err != nil {
		return fmt.Errorf("cannot get not checkpointed segments: %w", err)
	}

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return fmt.Errorf("cannot get latest checkpoint: %w", err)
	}

	if latestCheckpoint == to {
		return nil // nothing to do
	}

	if notCheckpointedTo < to {
		return fmt.Errorf("no segments to checkpoint to %d, latest not checkpointed segment: %d", to, notCheckpointedTo)
	}

	forest, err := mtrie.NewForest(c.forestCapacity, &metrics.NoopCollector{}, nil)
	if err != nil {
		return fmt.Errorf("cannot create Forest: %w", err)
	}

	c.wal.log.Info().Msgf("creating checkpoint %d", to)

	err = c.wal.replay(0, to,
		func(tries []*trie.MTrie) error {
			return forest.AddTries(tries)
		},
		func(update *ledger.TrieUpdate) error {
			_, err := forest.Update(update)
			return err
		}, func(rootHash ledger.RootHash) error {
			return nil
		}, true)

	if err != nil {
		return fmt.Errorf("cannot replay WAL: %w", err)
	}

	tries, err := forest.GetTries()
	if err != nil {
		return fmt.Errorf("cannot get forest tries: %w", err)
	}

	c.wal.log.Info().Msgf("serializing checkpoint %d", to)

	fileName := NumberToFilename(to)

	err = StoreCheckpointV6SingleThread(tries, c.wal.dir, fileName, c.wal.log)
	if err != nil {
		return fmt.Errorf("could not create checkpoint for %v: %w", to, err)
	}

	checkpointFileSize, err := ReadCheckpointFileSize(c.wal.dir, fileName)
	if err != nil {
		return fmt.Errorf("could not read checkpoint file size: %w", err)
	}

	c.wal.log.Info().
		Str("checkpoint_file_size", units.BytesSize(float64(checkpointFileSize))).
		Msgf("created checkpoint %d with %d tries", to, len(tries))

	return nil
}

func NumberToFilenamePart(n int) string {
	return fmt.Sprintf("%08d", n)
}

func NumberToFilename(n int) string {
	return fmt.Sprintf("%s%s", checkpointFilenamePrefix, NumberToFilenamePart(n))
}

func (c *Checkpointer) CheckpointWriter(to int) (io.WriteCloser, error) {
	return CreateCheckpointWriterForFile(c.dir, NumberToFilename(to), c.wal.log)
}

func (c *Checkpointer) Dir() string {
	return c.dir
}

// CreateCheckpointWriterForFile returns a file writer that will write to a temporary file
// and then move it to the checkpoint folder by renaming it.
func CreateCheckpointWriterForFile(dir, filename string, logger zerolog.Logger) (io.WriteCloser, error) {

	fullname := path.Join(dir, filename)

	if utilsio.FileExists(fullname) {
		return nil, fmt.Errorf("checkpoint file %s already exists", fullname)
	}

	tmpFile, err := os.CreateTemp(dir, "writing-chkpnt-*")
	if err != nil {
		return nil, fmt.Errorf("cannot create temporary file for checkpoint %v: %w", tmpFile, err)
	}

	writer := bufio.NewWriterSize(tmpFile, defaultBufioWriteSize)
	return &SyncOnCloseRenameFile{
		logger:     logger,
		file:       tmpFile,
		targetName: fullname,
		Writer:     writer,
	}, nil
}
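// writeThenRenameSketch is an illustrative sketch (not part of the original
// file) of the pattern CreateCheckpointWriterForFile relies on (via
// SyncOnCloseRenameFile): write to a temporary file in the target directory,
// then rename it into place, so readers never observe a partially written
// checkpoint.
func writeThenRenameSketch(dir, filename string, payload []byte) error {
	tmpFile, err := os.CreateTemp(dir, "writing-chkpnt-*")
	if err != nil {
		return err
	}
	if _, err := tmpFile.Write(payload); err != nil {
		_ = tmpFile.Close()
		return err
	}
	if err := tmpFile.Sync(); err != nil { // flush to disk before renaming
		_ = tmpFile.Close()
		return err
	}
	if err := tmpFile.Close(); err != nil {
		return err
	}
	// The rename is atomic on POSIX filesystems when source and target are
	// on the same volume, which is why the temp file is created in dir.
	return os.Rename(tmpFile.Name(), path.Join(dir, filename))
}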
// StoreCheckpointV5 writes the given tries to a checkpoint file, and also appends
// a CRC32 file checksum for integrity checking.
// The checkpoint file consists of a flattened forest. Specifically, it consists of:
//   - a list of encoded nodes, where references to other nodes are by list index.
//   - a list of encoded tries, each referencing their respective root node by index.
//
// Referencing other nodes by index 0 is a special case, meaning nil.
//
// As an important property, the nodes are listed in an order which satisfies the
// Descendents-First-Relationship. The Descendents-First-Relationship has the
// following important property:
// when rebuilding the trie from the sequence of nodes, the trie can be built on the fly,
// as for each node, its children have already been encountered.
// TODO: evaluate alternatives to CRC32 since checkpoint file is many GB in size.
// TODO: add concurrency if the performance gains are enough to offset complexity.
func StoreCheckpointV5(dir string, fileName string, logger zerolog.Logger, tries ...*trie.MTrie) (
	// Note: a signature that didn't name the returned error ("error" instead of
	// "errToReturn error") would be wrong here, because the error needs a name
	// in order to be updated by the deferred function below.
	errToReturn error,
) {
	writer, err := CreateCheckpointWriterForFile(dir, fileName, logger)
	if err != nil {
		return fmt.Errorf("could not create writer: %w", err)
	}
	defer func() {
		errToReturn = closeAndMergeError(writer, errToReturn)
	}()

	crc32Writer := NewCRC32Writer(writer)

	// Scratch buffer is used as a temporary buffer that nodes can encode into.
	// Data in the scratch buffer should be copied or used before the scratch buffer is used again.
	// If the scratch buffer isn't large enough, a new buffer will be allocated.
	// However, 4096 bytes is large enough to handle almost all payloads
	// and 100% of interim nodes.
	scratch := make([]byte, 1024*4)

	// Write header: magic (2 bytes) + version (2 bytes)
	header := scratch[:headerSize]
	binary.BigEndian.PutUint16(header, MagicBytesCheckpointHeader)
	binary.BigEndian.PutUint16(header[encMagicSize:], VersionV5)

	_, err = crc32Writer.Write(header)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint header: %w", err)
	}

	// Multiple tries might share nodes at higher levels. However, we don't want to
	// serialize duplicated nodes in the checkpoint file. In order to deduplicate, we build
	// a map of unique nodes while iterating and serializing the nodes to the checkpoint file.
	//
	// A deduplication map covering all trie nodes would use a lot of memory.
	// In fact, we don't have to build a map for all nodes, since some nodes
	// are never shared. Nodes can be shared if and only if they are
	// on the same path; in other words, nodes on different paths won't be shared.
	// If we group trie nodes by path, then we get many smaller groups of trie nodes on the same path,
	// which might contain duplicates. For each group, we can then build a smaller map for deduplication.
	// Processing each group sequentially allows us to reduce operational memory.
	//
	// With this idea in mind, the serialization can be done in two steps:
	//  1. serialize nodes in subtries (tries with roots at subtrieLevel).
	//  2. serialize the remaining nodes (from the trie root to the subtrie roots).
	//
	// For instance, if there are 3 top tries and subtrieLevel is 4, then there will be
	// (2 ^ 4) * 3 = 48 subtrie root nodes at level 4.
	// Step 1 will serialize the 48 subtrie root nodes (and their descendants) into the
	// checkpoint file, and step 2 will serialize the 3 root nodes (level 0) and the
	// interim nodes from level 1 to 3 into the checkpoint file.
	//
	// Step 1:
	//  1. Find all the subtrie root nodes at subtrieLevel (level 4).
	//  2. Group the subtries by path. Since subtries in different groups have different paths,
	//     they won't share child nodes. Subtries in the same group might contain duplicates,
	//     so we build a map to deduplicate.
	//
	// subtrieLevel is the number of edges from the trie root to a subtrie root.
	// The trie root is at level 0.
	const subtrieLevel = 4

	// subtrieCount is the number of subtries at subtrieLevel.
	const subtrieCount = 1 << subtrieLevel
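	// On-disk layout sketch of a V5 checkpoint file (reconstructed from the
	// writes in this function; sizes in bytes):
	//
	//	magic (2) | version (2) | node 1 | ... | node N | trie 1 | ... | trie T |
	//	node count (8) | trie count (2) | CRC32 (4)
	//
	// Node references are list indices into the serialized node sequence;
	// index 0 means nil.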
	// Since each trie has `subtrieCount` subtries at subtrieLevel,
	// we create `subtrieCount` groups, where each group contains the subtrie
	// root nodes at one subtrie position across all tries.

	// subtrieRoots is an array of groups.
	// Each group contains the subtrie roots at the same path at subtrieLevel for different tries.
	// For example, if subtrieLevel is 4, then
	//   - subtrieRoots[0] is a list of all subtrie roots at path [0,0,0,0]
	//   - subtrieRoots[1] is a list of all subtrie roots at path [0,0,0,1]
	//   - subtrieRoots[subtrieCount-1] is a list of all subtrie roots at path [1,1,1,1]
	//
	// Subtrie roots in subtrieRoots[0] have the same path, and therefore might share child nodes.
	var subtrieRoots [subtrieCount][]*node.Node
	for i := 0; i < len(subtrieRoots); i++ {
		subtrieRoots[i] = make([]*node.Node, len(tries))
	}

	for trieIndex, t := range tries {
		// subtries is an array of subtrieCount trie nodes
		// in breadth-first order at subtrieLevel of the trie `t`
		subtries := getNodesAtLevel(t.RootNode(), subtrieLevel)
		for subtrieIndex, subtrieRoot := range subtries {
			subtrieRoots[subtrieIndex][trieIndex] = subtrieRoot
		}
	}

	// topLevelNodes contains all unique nodes of the given tries
	// from the root to the subtrie roots, and their indices
	// (ordered by node traversal sequence).
	// Index 0 is a special case, meaning the nil node.
	topLevelNodes := make(map[*node.Node]uint64, 1<<(subtrieLevel+1))
	topLevelNodes[nil] = 0

	// nodeCounter is a counter for all unique nodes.
	// It starts from 1, as 0 marks the nil node.
	nodeCounter := uint64(1)

	// estimatedSubtrieNodeCount is a rough estimate of the number of nodes in a subtrie,
	// assuming the trie is a full binary tree. estimatedSubtrieNodeCount is used
	// to preallocate traversedSubtrieNodes for memory efficiency.
	estimatedSubtrieNodeCount := 0
	if len(tries) > 0 {
		estimatedTrieNodeCount := 2*int(tries[0].AllocatedRegCount()) - 1
		estimatedSubtrieNodeCount = estimatedTrieNodeCount / subtrieCount
	}

	// Serialize subtrie nodes
	for i, subTrieRoot := range subtrieRoots {
		// traversedSubtrieNodes contains all unique nodes of subtries on the same path, and their indices.
		traversedSubtrieNodes := make(map[*node.Node]uint64, estimatedSubtrieNodeCount)
		// Index 0 is a special case, meaning the nil node.
		traversedSubtrieNodes[nil] = 0

		logging := logProgress(fmt.Sprintf("storing %v-th sub trie roots", i), estimatedSubtrieNodeCount, log.Logger)
		for _, root := range subTrieRoot {
			// The empty trie is always added to the forest as a starting point, and
			// the empty trie's root is nil. It remains in the forest until evicted
			// when the trie queue exceeds its capacity.
			if root == nil {
				continue
			}
			// Note: nodeCounter assigns a global index to each node, in the order in which it is
			// serialized into the checkpoint file. Therefore, it has to be reused when iterating
			// over each subtrie. storeUniqueNodes adds each unique visited node to
			// traversedSubtrieNodes, with the node itself as the key, and its position
			// (n-th node serialized into the checkpoint file) as the value.
			nodeCounter, err = storeUniqueNodes(root, traversedSubtrieNodes, nodeCounter, scratch, crc32Writer, logging)
			if err != nil {
				return fmt.Errorf("fail to store nodes in step 1 for subtrie root %v: %w", root.Hash(), err)
			}
			// Save the subtrie root node index in topLevelNodes,
			// so that when traversing the top-level tries
			// (from level 0 to subtrieLevel) using topLevelNodes,
			// the node iterator skips the subtrie as visited nodes.
			topLevelNodes[root] = traversedSubtrieNodes[root]
		}
	}
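	// Illustrative note (not in the original source): nodeCounter keeps
	// incrementing across all 16 groups above and then across step 2 below,
	// so every node in the file receives a unique global index. For example,
	// if group 0 serializes nodes 1..100, group 1 starts at 101, and step 2
	// continues from wherever the last group stopped.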
	// Step 2:
	// All nodes at subtrieLevel and deeper have now been serialized. We now
	// serialize the remaining nodes of each trie, from the root node (level 0)
	// down to (subtrieLevel - 1).
	for _, t := range tries {
		root := t.RootNode()
		if root == nil {
			continue
		}
		// If we iterated through the root trie with an empty visited-nodes map, the iterator
		// would visit all nodes at all levels. In order to skip the nodes at subtrieLevel and
		// deeper, which have already been serialized in step 1, we pass in a visited-nodes map
		// that contains all the subtrie root nodes: topLevelNodes. topLevelNodes was built in
		// step 1, while serializing each subtrie root node.
		nodeCounter, err = storeUniqueNodes(root, topLevelNodes, nodeCounter, scratch, crc32Writer, func(uint64) {})
		if err != nil {
			return fmt.Errorf("fail to store nodes in step 2 for root trie %v: %w", root.Hash(), err)
		}
	}

	// The tries themselves are serialized at the end of the checkpoint file,
	// so that it's easy to find which tries are included.
	for _, t := range tries {
		rootNode := t.RootNode()
		if !t.IsEmpty() && rootNode.Height() != ledger.NodeMaxHeight {
			return fmt.Errorf("height of root node must be %d, but is %d",
				ledger.NodeMaxHeight, rootNode.Height())
		}

		// Get root node index
		rootIndex, found := topLevelNodes[rootNode]
		if !found {
			rootHash := t.RootHash()
			return fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(rootHash[:]))
		}

		encTrie := flattener.EncodeTrie(t, rootIndex, scratch)
		_, err = crc32Writer.Write(encTrie)
		if err != nil {
			return fmt.Errorf("cannot serialize trie: %w", err)
		}
	}

	// All trie nodes have been serialized into the checkpoint file; now
	// write the footer with the node count and trie count.
	footer := scratch[:encNodeCountSize+encTrieCountSize]
	binary.BigEndian.PutUint64(footer, nodeCounter-1) // -1 to account for the 0 node meaning nil
	binary.BigEndian.PutUint16(footer[encNodeCountSize:], uint16(len(tries)))

	_, err = crc32Writer.Write(footer)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint footer: %w", err)
	}

	// Write the CRC32 sum for validation. The sum covers everything written
	// through crc32Writer: header, nodes, tries, and footer.
	crc32buf := scratch[:crc32SumSize]
	binary.BigEndian.PutUint32(crc32buf, crc32Writer.Crc32())

	_, err = writer.Write(crc32buf)
	if err != nil {
		return fmt.Errorf("cannot write CRC32: %w", err)
	}

	return nil
}

func logProgress(msg string, estimatedSubtrieNodeCount int, logger zerolog.Logger) func(nodeCounter uint64) {
	lg := util.LogProgress(
		logger,
		util.DefaultLogProgressConfig(
			msg,
			estimatedSubtrieNodeCount,
		),
	)
	return func(index uint64) {
		lg(1)
	}
}
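// Example (usage sketch drawn from readCheckpointV5 below): logProgress
// returns a callback that advances the underlying progress logger by one
// unit per call; the uint64 argument is ignored, so callers simply invoke
// it once per node:
//
//	logging := logProgress("reading trie nodes", int(nodesCount), logger)
//	for i := uint64(1); i <= nodesCount; i++ {
//		// ... read node i ...
//		logging(i)
//	}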
// storeUniqueNodes iterates over and serializes the unique nodes of the trie with the given root node.
// It also saves the unique nodes and their node counters in the visitedNodes map.
// It returns the updated nodeCounter and an error (if any).
func storeUniqueNodes(
	root *node.Node,
	visitedNodes map[*node.Node]uint64,
	nodeCounter uint64,
	scratch []byte,
	writer io.Writer,
	nodeCounterUpdated func(nodeCounter uint64), // for logging estimated progress
) (uint64, error) {

	for itr := flattener.NewUniqueNodeIterator(root, visitedNodes); itr.Next(); {
		n := itr.Value()

		visitedNodes[n] = nodeCounter
		nodeCounter++
		nodeCounterUpdated(nodeCounter)

		var lchildIndex, rchildIndex uint64

		if lchild := n.LeftChild(); lchild != nil {
			var found bool
			lchildIndex, found = visitedNodes[lchild]
			if !found {
				hash := lchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}
		if rchild := n.RightChild(); rchild != nil {
			var found bool
			rchildIndex, found = visitedNodes[rchild]
			if !found {
				hash := rchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}

		encNode := flattener.EncodeNode(n, lchildIndex, rchildIndex, scratch)
		_, err := writer.Write(encNode)
		if err != nil {
			return 0, fmt.Errorf("cannot serialize node: %w", err)
		}
	}

	return nodeCounter, nil
}
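// Illustrative note (not in the original source): flattener.NewUniqueNodeIterator
// yields children before parents (the Descendents-First-Relationship), so by the
// time a node is serialized, both of its children already have indices in
// visitedNodes. A reader reconstructing the forest therefore only ever looks
// backwards:
//
//	nodes := make([]*node.Node, nodeCount+1) // index 0 means nil
//	for i := uint64(1); i <= nodeCount; i++ {
//		// decode node i; its child indices are guaranteed to be < i
//	}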
// getNodesAtLevel returns the 2^level nodes at the given level in breadth-first order.
// It guarantees the size and order of the returned nodes (with a nil element if there
// is no node at that position).
// For example, given a nil root and level 3, getNodesAtLevel returns a slice
// of 2^3 nil elements.
func getNodesAtLevel(root *node.Node, level uint) []*node.Node {
	nodes := []*node.Node{root}
	nodesLevel := uint(0)

	// Use breadth-first traversal to get all nodes at the given level.
	// If a node isn't found, a nil node is used in its place.
	for nodesLevel < level {
		nextLevel := nodesLevel + 1
		nodesAtNextLevel := make([]*node.Node, 1<<nextLevel)

		for i, n := range nodes {
			if n != nil {
				nodesAtNextLevel[i*2] = n.LeftChild()
				nodesAtNextLevel[i*2+1] = n.RightChild()
			}
		}

		nodes = nodesAtNextLevel
		nodesLevel = nextLevel
	}

	return nodes
}

func (c *Checkpointer) LoadCheckpoint(checkpoint int) ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, NumberToFilename(checkpoint))
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) LoadRootCheckpoint() ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, bootstrap.FilenameWALRootCheckpoint)
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) HasRootCheckpoint() (bool, error) {
	return HasRootCheckpoint(c.dir)
}

func HasRootCheckpoint(dir string) (bool, error) {
	if _, err := os.Stat(path.Join(dir, bootstrap.FilenameWALRootCheckpoint)); err == nil {
		return true, nil
	} else if os.IsNotExist(err) {
		return false, nil
	} else {
		return false, err
	}
}

func (c *Checkpointer) RemoveCheckpoint(checkpoint int) error {
	name := NumberToFilename(checkpoint)
	return deleteCheckpointFiles(c.dir, name)
}

func LoadCheckpoint(filepath string, logger zerolog.Logger) (
	tries []*trie.MTrie,
	errToReturn error) {
	file, err := os.Open(filepath)
	if err != nil {
		return nil, fmt.Errorf("cannot open checkpoint file %s: %w", filepath, err)
	}
	defer func() {
		evictErr := evictFileFromLinuxPageCache(file, false, logger)
		if evictErr != nil {
			logger.Warn().Msgf("failed to evict file %s from Linux page cache: %s", filepath, evictErr)
			// No need to return this error because it's possible to continue normal operations.
		}

		errToReturn = closeAndMergeError(file, errToReturn)
	}()

	return readCheckpoint(file, logger)
}

func readCheckpoint(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {

	// Read header: magic (2 bytes) + version (2 bytes)
	header := make([]byte, headerSize)
	_, err := io.ReadFull(f, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Decode header
	magicBytes := binary.BigEndian.Uint16(header)
	version := binary.BigEndian.Uint16(header[encMagicSize:])

	// Reset offset
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	if magicBytes != MagicBytesCheckpointHeader {
		return nil, fmt.Errorf("unknown file format. Magic constant %x does not match expected %x", magicBytes, MagicBytesCheckpointHeader)
	}

	switch version {
	case VersionV1, VersionV3:
		return readCheckpointV3AndEarlier(f, version)
	case VersionV4:
		return readCheckpointV4(f)
	case VersionV5:
		return readCheckpointV5(f, logger)
	case VersionV6:
		return readCheckpointV6(f, logger)
	default:
		return nil, fmt.Errorf("unsupported file version %x", version)
	}
}

type nodeWithRegMetrics struct {
	n        *node.Node
	regCount uint64
	regSize  uint64
}
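// Example (hypothetical usage sketch): loading all tries from checkpoint 7 in
// a given directory, mirroring Checkpointer.LoadCheckpoint above:
//
//	tries, err := LoadCheckpoint(path.Join(dir, NumberToFilename(7)), logger)
//	if err != nil {
//		// handle error
//	}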
// readCheckpointV3AndEarlier deserializes a checkpoint file (version 3 and earlier) and returns a list of tries.
// The header (magic and version) is verified by the caller.
// This function exists for backwards compatibility and is not optimized.
func readCheckpointV3AndEarlier(f *os.File, version uint16) ([]*trie.MTrie, error) {

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)

	var reader io.Reader

	if version != VersionV3 {
		reader = bufReader
	} else {
		reader = crcReader
	}

	// Read header (magic + version), node count, and trie count.
	header := make([]byte, headerSize+encNodeCountSize+encTrieCountSize)

	_, err := io.ReadFull(reader, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Magic and version are verified by the caller.

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(header[headerSize:])
	triesCount := binary.BigEndian.Uint16(header[headerSize+encNodeCountSize:])

	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendents-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	if version == VersionV3 {
		crc32buf := make([]byte, crc32SumSize)

		_, err := io.ReadFull(bufReader, crc32buf)
		if err != nil {
			return nil, fmt.Errorf("cannot read CRC32: %w", err)
		}

		readCrc32 := binary.BigEndian.Uint32(crc32buf)

		calculatedCrc32 := crcReader.Crc32()

		if calculatedCrc32 != readCrc32 {
			return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
		}
	}

	return tries, nil
}
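// readFooterSketch is an illustrative sketch (not part of the original file)
// of how the V4 and V5 readers below locate their footer: seek back from the
// end of the file past the CRC32 sum, then decode the node count (8 bytes)
// and trie count (2 bytes), both big endian.
func readFooterSketch(f *os.File) (nodeCount uint64, trieCount uint16, err error) {
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	footer := make([]byte, encNodeCountSize+encTrieCountSize)
	if _, err = f.Seek(-footerOffset, io.SeekEnd); err != nil {
		return 0, 0, err
	}
	if _, err = io.ReadFull(f, footer); err != nil {
		return 0, 0, err
	}
	nodeCount = binary.BigEndian.Uint64(footer)
	trieCount = binary.BigEndian.Uint16(footer[encNodeCountSize:])
	return nodeCount, trieCount, nil
}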
// readCheckpointV4 decodes a checkpoint file (version 4) and returns a list of tries.
// The header (magic and version) is verified by the caller.
// This function exists for backwards compatibility.
func readCheckpointV4(f *os.File) ([]*trie.MTrie, error) {

	// Scratch buffer is used as a temporary buffer that the reader can read into.
	// Raw data in the scratch buffer should be copied or converted into desired
	// objects before the next Read operation. If the scratch buffer isn't large
	// enough, a new buffer will be allocated. However, 4096 bytes is
	// large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read the footer to get the node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include the CRC32 sum

	// Seek to the footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of the file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes)
	// No action is needed for the header because it is verified by the caller.
	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// The nodes element at index 0 is special: it means nil.
	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendents-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read the footer again so it is included in the CRC32 computation.
	// No other action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}
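// Note (reconstructed from the V4/V5 readers): the trailing CRC32 covers every
// byte of the file except the 4-byte checksum itself. That is why the reader
// re-reads the footer through the CRC32 reader before reading the checksum,
// which is read from the plain buffered reader and then compared.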
// readCheckpointV5 decodes a checkpoint file (version 5) and returns a list of tries.
// The checkpoint file header (magic and version) is verified by the caller.
func readCheckpointV5(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {
	logger.Info().Msgf("reading v5 checkpoint file")

	// Scratch buffer is used as a temporary buffer that the reader can read into.
	// Raw data in the scratch buffer should be copied or converted into desired
	// objects before the next Read operation. If the scratch buffer isn't large
	// enough, a new buffer will be allocated. However, 4096 bytes is
	// large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read the footer to get the node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include the CRC32 sum

	// Seek to the footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of the file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes)
	// No action is needed for the header because it is verified by the caller.
	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}
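	// Illustrative note (not in the original source): V5 uses the same
	// footer-first layout as V4; the difference lies in the node and trie
	// encodings (flattener.ReadNode vs. flattener.ReadNodeFromCheckpointV4),
	// which no longer carry per-node register counts.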
	// The nodes element at index 0 is special: it means nil.
	nodes := make([]*node.Node, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	logging := logProgress("reading trie nodes", int(nodesCount), logger)

	for i := uint64(1); i <= nodesCount; i++ {
		n, err := flattener.ReadNode(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(i) {
				return nil, fmt.Errorf("sequence of serialized nodes does not satisfy Descendents-First-Relationship")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i] = n
		logging(i)
	}

	logger.Info().Msgf("finished loading %v trie nodes, start loading %v tries", nodesCount, triesCount)

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrie(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read the footer again so it is included in the CRC32 computation.
	// No other action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}

// evictFileFromLinuxPageCache advises Linux to evict a file from the Linux page cache.
// A use case is when a new checkpoint is loaded or created: Linux may cache big
// checkpoint files in memory until evictFileFromLinuxPageCache causes them to be
// evicted from the Linux page cache. Not calling evictFileFromLinuxPageCache()
// causes two checkpoint files to be cached for each checkpointing, eventually
// caching hundreds of GB.
// CAUTION: no-op when GOOS != linux.
func evictFileFromLinuxPageCache(f *os.File, fsync bool, logger zerolog.Logger) error {
	err := fadviseNoLinuxPageCache(f.Fd(), fsync)
	if err != nil {
		return err
	}

	size := int64(0)
	fstat, err := f.Stat()
	if err == nil {
		size = fstat.Size()
	}

	logger.Info().Str("filename", f.Name()).Int64("size_mb", size/1024/1024).Msg("evicted file from Linux page cache")
	return nil
}
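// Illustrative sketch (not part of the original file) of the errgroup fan-out
// pattern that CopyCheckpointFile below uses: launch one goroutine per matched
// file and collect the first error, if any, via group.Wait().
//
//	var group errgroup.Group
//	for _, m := range matched {
//		m := m // capture loop variable (pre-Go 1.22 semantics)
//		group.Go(func() error {
//			return utilsio.Copy(m, filepath.Join(to, filepath.Base(m)))
//		})
//	}
//	err := group.Wait()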
// CopyCheckpointFile copies the checkpoint file, including its part files, from the
// given `from` directory to the `to` directory.
// It returns the paths of all the copied files.
// Any errors returned are exceptions.
func CopyCheckpointFile(filename string, from string, to string) (
	[]string,
	error,
) {
	// It's possible that the trie dir does not yet exist. If not, this will create the required path.
	err := os.MkdirAll(to, 0700)
	if err != nil {
		return nil, err
	}

	// checkpoint V6 produces multiple checkpoint part files that need to be copied over
	pattern := filePathPattern(from, filename)
	matched, err := filepath.Glob(pattern)
	if err != nil {
		return nil, fmt.Errorf("could not glob checkpoint file with pattern %v: %w", pattern, err)
	}

	newPaths := make([]string, len(matched))
	// copy the root checkpoint concurrently
	var group errgroup.Group

	for i, match := range matched {
		_, partfile := filepath.Split(match)
		newPath := filepath.Join(to, partfile)
		newPaths[i] = newPath

		match := match
		group.Go(func() error {
			err := utilsio.Copy(match, newPath)
			if err != nil {
				return fmt.Errorf("cannot copy file from %v to %v", match, newPath)
			}
			return nil
		})
	}

	err = group.Wait()
	if err != nil {
		return nil, fmt.Errorf("fail to copy checkpoint files: %w", err)
	}

	return newPaths, nil
}
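// Example (hypothetical usage sketch): copying the root checkpoint and all of
// its V6 part files from a bootstrap directory into the trie directory; the
// directory variables are assumptions for illustration:
//
//	copied, err := CopyCheckpointFile(bootstrap.FilenameWALRootCheckpoint, bootstrapDir, trieDir)
//	if err != nil {
//		// handle error
//	}
//	// copied holds the destination paths of the header file and all part files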