github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/ledger/complete/wal/checkpointer.go

package wal

import (
	"bufio"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/docker/go-units"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	"golang.org/x/sync/errgroup"

	"github.com/onflow/flow-go/ledger"
	"github.com/onflow/flow-go/ledger/complete/mtrie"
	"github.com/onflow/flow-go/ledger/complete/mtrie/flattener"
	"github.com/onflow/flow-go/ledger/complete/mtrie/node"
	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
	"github.com/onflow/flow-go/model/bootstrap"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/module/util"
	utilsio "github.com/onflow/flow-go/utils/io"
)

const checkpointFilenamePrefix = "checkpoint."

const (
	MagicBytesCheckpointHeader  uint16 = 0x2137
	MagicBytesCheckpointSubtrie uint16 = 0x2136
	MagicBytesCheckpointToptrie uint16 = 0x2135
	MagicBytesPayloadHeader     uint16 = 0x2138
)

const VersionV1 uint16 = 0x01

// Version numbering was reset while changing the trie format, so it jumps to 3 to avoid conflicts.
// Version 3 contains a file checksum for detecting corrupted checkpoint files.
const VersionV3 uint16 = 0x03

// Version 4 contains a footer with node count and trie count (previously in the header).
// Version 4 also reduces checkpoint data size. See EncodeNode() and EncodeTrie() for more details.
const VersionV4 uint16 = 0x04

// Version 5 includes these changes:
//   - remove regCount and maxDepth from serialized nodes
//   - add allocated register count and size to serialized tries
//   - reduce the number of bytes used to encode payload value size from 8 bytes to 4 bytes.
//
// See EncodeNode() and EncodeTrie() for more details.
const VersionV5 uint16 = 0x05

// Version 6 includes these changes:
//   - trie nodes are stored in 17 additional checkpoint files, with .0, .1, .2, ... .16 as
//     file name extensions
const VersionV6 uint16 = 0x06

// MaxVersion is the latest checkpoint version we support.
// MaxVersion needs to be updated when a newer version is created.
const MaxVersion = VersionV6

const (
	encMagicSize        = 2
	encVersionSize      = 2
	headerSize          = encMagicSize + encVersionSize
	encSubtrieCountSize = 2
	encNodeCountSize    = 8
	encTrieCountSize    = 2
	crc32SumSize        = 4
)

// defaultBufioReadSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioReadSize can be increased to 8KiB, 16KiB, 32KiB, etc. if it
// improves performance on typical EN hardware.
const defaultBufioReadSize = 1024 * 32

// defaultBufioWriteSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioWriteSize can be increased to 8KiB, 16KiB, 32KiB, etc. if it
// improves performance on typical EN hardware.
const defaultBufioWriteSize = 1024 * 32

type Checkpointer struct {
	dir            string
	wal            *DiskWAL
	keyByteSize    int
	forestCapacity int
}

func NewCheckpointer(wal *DiskWAL, keyByteSize int, forestCapacity int) *Checkpointer {
	return &Checkpointer{
		dir:            wal.wal.Dir(),
		wal:            wal,
		keyByteSize:    keyByteSize,
		forestCapacity: forestCapacity,
	}
}

// listCheckpoints returns all the numbers (unsorted) of the checkpoint files, and the number of the last checkpoint.
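// For example, a directory containing "checkpoint.00000001" and
// "checkpoint.00000005" yields ([]int{1, 5}, 5, nil), with the list
// in no particular order.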
func (c *Checkpointer) listCheckpoints() ([]int, int, error) {
	return ListCheckpoints(c.dir)
}

// ListCheckpoints returns all the numbers of the checkpoint files, and the number of the last checkpoint.
// Note: it doesn't include the root checkpoint file.
func ListCheckpoints(dir string) ([]int, int, error) {
	list := make([]int, 0)

	files, err := os.ReadDir(dir)
	if err != nil {
		return nil, -1, fmt.Errorf("cannot list directory [%s] content: %w", dir, err)
	}
	last := -1
	for _, fn := range files {
		fname := fn.Name()
		if !strings.HasPrefix(fname, checkpointFilenamePrefix) {
			continue
		}
		justNumber := fname[len(checkpointFilenamePrefix):]
		k, err := strconv.Atoi(justNumber)
		if err != nil {
			continue
		}

		list = append(list, k)

		// the last checkpoint is the one with the highest number
		if k > last {
			last = k
		}
	}

	return list, last, nil
}

// Checkpoints returns all the numbers of the checkpoint files in ascending order.
// Note: it doesn't include the root checkpoint file.
func (c *Checkpointer) Checkpoints() ([]int, error) {
	return Checkpoints(c.dir)
}

// Checkpoints returns all the checkpoint numbers in ascending order.
func Checkpoints(dir string) ([]int, error) {
	list, _, err := ListCheckpoints(dir)
	if err != nil {
		return nil, fmt.Errorf("could not fetch all checkpoints: %w", err)
	}

	sort.Ints(list)

	return list, nil
}

// LatestCheckpoint returns the number of the latest checkpoint, or -1 if there are no checkpoints.
func (c *Checkpointer) LatestCheckpoint() (int, error) {
	_, last, err := c.listCheckpoints()
	return last, err
}

// NotCheckpointedSegments returns the numbers of segments which are not checkpointed yet,
// or -1, -1 if there are no segments.
func (c *Checkpointer) NotCheckpointedSegments() (from, to int, err error) {

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get last checkpoint: %w", err)
	}

	first, last, err := c.wal.Segments()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get range of segments: %w", err)
	}

	// there are no segments at all, there is nothing to checkpoint
	if first == -1 && last == -1 {
		return -1, -1, nil
	}

	// no checkpoints
	if latestCheckpoint == -1 {
		return first, last, nil
	}

	// all segments are already covered by the latest checkpoint
	if last <= latestCheckpoint {
		return -1, -1, nil
	}

	// there is a gap between the last checkpoint and the segments
	if last > latestCheckpoint && latestCheckpoint < first-1 {
		return -1, -1, fmt.Errorf("gap between last checkpoint and segments")
	}

	return latestCheckpoint + 1, last, nil
}

// Checkpoint creates a new checkpoint stopping at the given segment.
func (c *Checkpointer) Checkpoint(to int) (err error) {

	_, notCheckpointedTo, err := c.NotCheckpointedSegments()
	if err != nil {
		return fmt.Errorf("cannot get not checkpointed segments: %w", err)
	}

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return fmt.Errorf("cannot get latest checkpoint: %w", err)
	}

	if latestCheckpoint == to {
		return nil // nothing to do
	}

	if notCheckpointedTo < to {
		return fmt.Errorf("no segments to checkpoint to %d, latest not checkpointed segment: %d", to, notCheckpointedTo)
	}

	forest, err := mtrie.NewForest(c.forestCapacity, &metrics.NoopCollector{}, nil)
	if err != nil {
		return fmt.Errorf("cannot create Forest: %w", err)
	}

	c.wal.log.Info().Msgf("creating checkpoint %d", to)

	err = c.wal.replay(0, to,
		func(tries []*trie.MTrie) error {
			return forest.AddTries(tries)
		},
		func(update *ledger.TrieUpdate) error {
			_, err := forest.Update(update)
			return err
		}, func(rootHash ledger.RootHash) error {
			return nil
		}, true)

	if err != nil {
		return fmt.Errorf("cannot replay WAL: %w", err)
	}

	tries, err := forest.GetTries()
	if err != nil {
		return fmt.Errorf("cannot get forest tries: %w", err)
	}

	c.wal.log.Info().Msgf("serializing checkpoint %d", to)

	fileName := NumberToFilename(to)

	err = StoreCheckpointV6SingleThread(tries, c.wal.dir, fileName, c.wal.log)
	if err != nil {
		return fmt.Errorf("could not create checkpoint for %v: %w", to, err)
	}

	checkpointFileSize, err := ReadCheckpointFileSize(c.wal.dir, fileName)
	if err != nil {
		return fmt.Errorf("could not read checkpoint file size: %w", err)
	}

	c.wal.log.Info().
		Str("checkpoint_file_size", units.BytesSize(float64(checkpointFileSize))).
		Msgf("created checkpoint %d with %d tries", to, len(tries))

	return nil
}

func NumberToFilenamePart(n int) string {
	return fmt.Sprintf("%08d", n)
}

func NumberToFilename(n int) string {
	return fmt.Sprintf("%s%s", checkpointFilenamePrefix, NumberToFilenamePart(n))
}

func (c *Checkpointer) CheckpointWriter(to int) (io.WriteCloser, error) {
	return CreateCheckpointWriterForFile(c.dir, NumberToFilename(to), c.wal.log)
}

func (c *Checkpointer) Dir() string {
	return c.dir
}

// CreateCheckpointWriterForFile returns a file writer that will write to a temporary file
// and then move it to the checkpoint folder by renaming it.
func CreateCheckpointWriterForFile(dir, filename string, logger zerolog.Logger) (io.WriteCloser, error) {

	fullname := path.Join(dir, filename)

	if utilsio.FileExists(fullname) {
		return nil, fmt.Errorf("checkpoint file %s already exists", fullname)
	}

	tmpFile, err := os.CreateTemp(dir, "writing-chkpnt-*")
	if err != nil {
		return nil, fmt.Errorf("cannot create temporary file for checkpoint %v: %w", tmpFile, err)
	}

	writer := bufio.NewWriterSize(tmpFile, defaultBufioWriteSize)
	return &SyncOnCloseRenameFile{
		logger:     logger,
		file:       tmpFile,
		targetName: fullname,
		Writer:     writer,
	}, nil
}

// StoreCheckpointV5 writes the given tries to a checkpoint file, and also appends
// a CRC32 file checksum for integrity checking.
// The checkpoint file consists of a flattened forest. Specifically, it consists of:
//   - a list of encoded nodes, where references to other nodes are by list index.
//   - a list of encoded tries, each referencing their respective root node by index.
//
// Referencing another node by index 0 is a special case, meaning nil.
//
// As an important property, the nodes are listed in an order which satisfies the
// Descendants-First-Relationship. This relationship has the following important
// property: when rebuilding the trie from the sequence of nodes, the trie can be
// built on the fly, since for each node its children have already been encountered.
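//
// For orientation, the resulting file layout is, in order: a 4-byte header
// (magic + version), the encoded nodes, the encoded tries, a footer
// (8-byte node count + 2-byte trie count), and a 4-byte CRC32 sum covering
// everything that precedes it.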
// TODO: evaluate alternatives to CRC32 since the checkpoint file is many GB in size.
// TODO: add concurrency if the performance gains are enough to offset the complexity.
func StoreCheckpointV5(dir string, fileName string, logger zerolog.Logger, tries ...*trie.MTrie) (
	// error
	// Note: the above (an unnamed error return) would be wrong, because the
	// returned error needs a name in order to be updated by the deferred function.
	errToReturn error,
) {
	writer, err := CreateCheckpointWriterForFile(dir, fileName, logger)
	if err != nil {
		return fmt.Errorf("could not create writer: %w", err)
	}
	defer func() {
		errToReturn = closeAndMergeError(writer, errToReturn)
	}()

	crc32Writer := NewCRC32Writer(writer)

	// The scratch buffer is a temporary buffer that nodes can encode into.
	// Data in the scratch buffer should be copied or used before the scratch buffer is reused.
	// If the scratch buffer isn't large enough, a new buffer will be allocated.
	// However, 4096 bytes is large enough to handle almost all payloads
	// and 100% of interim nodes.
	scratch := make([]byte, 1024*4)

	// Write header: magic (2 bytes) + version (2 bytes)
	header := scratch[:headerSize]
	binary.BigEndian.PutUint16(header, MagicBytesCheckpointHeader)
	binary.BigEndian.PutUint16(header[encMagicSize:], VersionV5)

	_, err = crc32Writer.Write(header)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint header: %w", err)
	}

	// Multiple tries might share nodes at higher levels. However, we don't want to
	// serialize duplicated nodes in the checkpoint file. In order to deduplicate, we build
	// a map of unique nodes while iterating and serializing the nodes to the checkpoint file.
	//
	// A deduplication map containing all the trie nodes would use a lot of memory.
	// In fact, we don't have to build a map for all nodes, since there are nodes which
	// are never shared. Nodes can be shared if and only if they are on the same path.
	// In other words, nodes on different paths won't be shared.
	// If we group trie nodes by path, then we have many smaller groups of trie nodes
	// on the same path, which might contain duplicates. For each group, we can then
	// build a smaller map for deduplication. Processing each group sequentially
	// allows us to reduce operational memory.
	//
	// With this idea in mind, the serialization can be done in two steps:
	//  1. serialize nodes in subtries (tries with roots at subtrieLevel).
	//  2. serialize the remaining nodes (from the trie root down to the subtrie roots).
	//
	// For instance, if there are 3 top tries, and subtrieLevel is 4, then there will be
	// (2 ^ 4) * 3 = 48 subtrie root nodes at level 4.
	// Step 1 will serialize the 48 subtries into the checkpoint file, and then
	// step 2 will serialize the 3 root nodes (level 0) and the interim nodes from
	// level 1 to 3 into the checkpoint file.
	//
	// Step 1:
	//  1. Find all the subtrie root nodes at subtrieLevel (level 4).
	//  2. Group the subtries by path. Since subtries in different groups have different
	//     paths, they won't share child nodes. Subtries in the same group might contain
	//     duplicates, so we build a map per group to deduplicate.
	//
	// subtrieLevel is the number of edges from the trie root to a subtrie root.
	// The trie root is at level 0.
	const subtrieLevel = 4

	// subtrieCount is the number of subtries at subtrieLevel.
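	// With subtrieLevel = 4, subtrieCount is 2^4 = 16: one subtrie root per
	// possible 4-bit path prefix.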
	const subtrieCount = 1 << subtrieLevel

	// Since each trie has `subtrieCount` subtries at subtrieLevel, we create
	// `subtrieCount` groups; each group contains the subtrie root nodes at one
	// particular path, across all tries.

	// subtrieRoots is an array of groups.
	// Each group contains the subtrie roots at the same path at subtrieLevel for different tries.
	// For example, if subtrieLevel is 4, then
	//   - subtrieRoots[0] is a list of all subtrie roots at path [0,0,0,0]
	//   - subtrieRoots[1] is a list of all subtrie roots at path [0,0,0,1]
	//   - subtrieRoots[subtrieCount-1] is a list of all subtrie roots at path [1,1,1,1]
	// Subtrie roots in subtrieRoots[0] have the same path, and therefore might share child nodes.
	var subtrieRoots [subtrieCount][]*node.Node
	for i := 0; i < len(subtrieRoots); i++ {
		subtrieRoots[i] = make([]*node.Node, len(tries))
	}

	for trieIndex, t := range tries {
		// subtries is an array with subtrieCount trie nodes
		// in breadth-first order at subtrieLevel of the trie `t`
		subtries := getNodesAtLevel(t.RootNode(), subtrieLevel)
		for subtrieIndex, subtrieRoot := range subtries {
			subtrieRoots[subtrieIndex][trieIndex] = subtrieRoot
		}
	}

	// topLevelNodes contains all unique nodes of the given tries
	// from the root down to the subtrie roots and their index
	// (ordered by node traversal sequence).
	// Index 0 is a special case meaning the nil node.
	topLevelNodes := make(map[*node.Node]uint64, 1<<(subtrieLevel+1))
	topLevelNodes[nil] = 0

	// nodeCounter is a counter for all unique nodes.
	// It starts from 1, as 0 marks the nil node.
	nodeCounter := uint64(1)

	// estimatedSubtrieNodeCount is a rough estimate of the number of nodes in a subtrie,
	// assuming the trie is a full binary tree. estimatedSubtrieNodeCount is used
	// to preallocate traversedSubtrieNodes for memory efficiency.
	estimatedSubtrieNodeCount := 0
	if len(tries) > 0 {
		estimatedTrieNodeCount := 2*int(tries[0].AllocatedRegCount()) - 1
		estimatedSubtrieNodeCount = estimatedTrieNodeCount / subtrieCount
	}

	// Serialize subtrie nodes
	for i, subTrieRoot := range subtrieRoots {
		// traversedSubtrieNodes contains all unique nodes of subtries on the same path and their index.
		traversedSubtrieNodes := make(map[*node.Node]uint64, estimatedSubtrieNodeCount)
		// Index 0 is a special case meaning the nil node.
		traversedSubtrieNodes[nil] = 0

		logging := logProgress(fmt.Sprintf("storing %v-th sub trie roots", i), estimatedSubtrieNodeCount, log.Logger)
		for _, root := range subTrieRoot {
			// The empty trie is always added to the forest as a starting point, and
			// the empty trie's root is nil. It remains in the forest until evicted
			// by the trie queue exceeding capacity.
			if root == nil {
				continue
			}
			// Note: nodeCounter assigns a global index to each node in the order in which it is
			// serialized into the checkpoint file. Therefore, it has to be reused when iterating each subtrie.
			// storeUniqueNodes adds each unique visited node to traversedSubtrieNodes, with the node
			// itself as key, and the position at which it was serialized into the checkpoint file as value.
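			// Note also that traversedSubtrieNodes is scoped to this group: once the
			// group is done, only the subtrie root indices are kept (in topLevelNodes)
			// and the per-group map becomes garbage collectable, which is what keeps
			// peak memory bounded.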
			nodeCounter, err = storeUniqueNodes(root, traversedSubtrieNodes, nodeCounter, scratch, crc32Writer, logging)
			if err != nil {
				return fmt.Errorf("failed to store nodes in step 1 for subtrie root %v: %w", root.Hash(), err)
			}
			// Save the subtrie root node index in topLevelNodes,
			// so that when traversing the top level tries
			// (from level 0 to subtrieLevel) using topLevelNodes,
			// the node iterator skips the subtries as visited nodes.
			topLevelNodes[root] = traversedSubtrieNodes[root]
		}
	}

	// Step 2:
	// Now that all nodes at subtrieLevel and below have been serialized, we
	// serialize the remaining nodes of each trie, from the root node (level 0)
	// down to level (subtrieLevel - 1).
	for _, t := range tries {
		root := t.RootNode()
		if root == nil {
			continue
		}
		// If we iterated through the root trie with an empty visited-nodes map, the iterator
		// would visit all nodes at all levels. In order to skip the nodes at subtrieLevel and
		// below, which have already been serialized in step 1, we pass in a visited-nodes map
		// that contains all the subtrie root nodes: topLevelNodes.
		// topLevelNodes was built in step 1, while serializing each subtrie root node.
		nodeCounter, err = storeUniqueNodes(root, topLevelNodes, nodeCounter, scratch, crc32Writer, func(uint64) {})
		if err != nil {
			return fmt.Errorf("failed to store nodes in step 2 for root trie %v: %w", root.Hash(), err)
		}
	}

	// The root tries are serialized at the end of the checkpoint file, so that it's easy
	// to find which tries are included.
	for _, t := range tries {
		rootNode := t.RootNode()
		if !t.IsEmpty() && rootNode.Height() != ledger.NodeMaxHeight {
			return fmt.Errorf("height of root node must be %d, but is %d",
				ledger.NodeMaxHeight, rootNode.Height())
		}

		// Get root node index
		rootIndex, found := topLevelNodes[rootNode]
		if !found {
			rootHash := t.RootHash()
			return fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(rootHash[:]))
		}

		encTrie := flattener.EncodeTrie(t, rootIndex, scratch)
		_, err = crc32Writer.Write(encTrie)
		if err != nil {
			return fmt.Errorf("cannot serialize trie: %w", err)
		}
	}

	// All trie nodes have been serialized into the checkpoint file; now
	// write the footer with the node count and trie count.
	footer := scratch[:encNodeCountSize+encTrieCountSize]
	binary.BigEndian.PutUint64(footer, nodeCounter-1) // -1 to account for the 0 node meaning nil
	binary.BigEndian.PutUint16(footer[encNodeCountSize:], uint16(len(tries)))

	_, err = crc32Writer.Write(footer)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint footer: %w", err)
	}

	// Write the CRC32 sum (accumulated over everything written so far) for validation
	crc32buf := scratch[:crc32SumSize]
	binary.BigEndian.PutUint32(crc32buf, crc32Writer.Crc32())

	_, err = writer.Write(crc32buf)
	if err != nil {
		return fmt.Errorf("cannot write CRC32: %w", err)
	}

	return nil
}

func logProgress(msg string, estimatedSubtrieNodeCount int, logger zerolog.Logger) func(nodeCounter uint64) {
	lg := util.LogProgress(
		logger,
		util.DefaultLogProgressConfig(
			msg,
			estimatedSubtrieNodeCount,
		),
	)
	return func(index uint64) {
		lg(1)
	}
}

// storeUniqueNodes iterates over and serializes the unique nodes of the trie with the given root node.
// It also saves the unique nodes and the node counter in the visitedNodes map.
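// It is called once per subtrie root in step 1 of StoreCheckpointV5 above (each
// call with a fresh per-group visitedNodes map) and once per trie root in step 2
// (all calls sharing topLevelNodes), with nodeCounter threaded through every call.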
// It returns the updated nodeCounter and an error (if any).
func storeUniqueNodes(
	root *node.Node,
	visitedNodes map[*node.Node]uint64,
	nodeCounter uint64,
	scratch []byte,
	writer io.Writer,
	nodeCounterUpdated func(nodeCounter uint64), // for logging estimated progress
) (uint64, error) {

	for itr := flattener.NewUniqueNodeIterator(root, visitedNodes); itr.Next(); {
		n := itr.Value()

		visitedNodes[n] = nodeCounter
		nodeCounter++
		nodeCounterUpdated(nodeCounter)

		var lchildIndex, rchildIndex uint64

		if lchild := n.LeftChild(); lchild != nil {
			var found bool
			lchildIndex, found = visitedNodes[lchild]
			if !found {
				hash := lchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}
		if rchild := n.RightChild(); rchild != nil {
			var found bool
			rchildIndex, found = visitedNodes[rchild]
			if !found {
				hash := rchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}

		encNode := flattener.EncodeNode(n, lchildIndex, rchildIndex, scratch)
		_, err := writer.Write(encNode)
		if err != nil {
			return 0, fmt.Errorf("cannot serialize node: %w", err)
		}
	}

	return nodeCounter, nil
}

// getNodesAtLevel returns the 2^level nodes at the given level in breadth-first order.
// It guarantees the size and order of the returned nodes (with a nil element if there
// is no node at a position).
// For example, given a nil root and level 3, getNodesAtLevel returns a slice
// of 2^3 nil elements.
func getNodesAtLevel(root *node.Node, level uint) []*node.Node {
	nodes := []*node.Node{root}
	nodesLevel := uint(0)

	// Use breadth-first traversal to get all nodes at the given level.
	// If a node isn't found, a nil node is used in its place.
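	// For example, with level 2: after one iteration nodes holds root's two
	// children at indices 0..1; after two, its four grandchildren at indices
	// 0..3, where index i*2 is the left child and i*2+1 the right child of
	// the node previously at index i (nil wherever a parent was nil).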
	for nodesLevel < level {
		nextLevel := nodesLevel + 1
		nodesAtNextLevel := make([]*node.Node, 1<<nextLevel)

		for i, n := range nodes {
			if n != nil {
				nodesAtNextLevel[i*2] = n.LeftChild()
				nodesAtNextLevel[i*2+1] = n.RightChild()
			}
		}

		nodes = nodesAtNextLevel
		nodesLevel = nextLevel
	}

	return nodes
}

func (c *Checkpointer) LoadCheckpoint(checkpoint int) ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, NumberToFilename(checkpoint))
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) LoadRootCheckpoint() ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, bootstrap.FilenameWALRootCheckpoint)
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) HasRootCheckpoint() (bool, error) {
	return HasRootCheckpoint(c.dir)
}

func HasRootCheckpoint(dir string) (bool, error) {
	if _, err := os.Stat(path.Join(dir, bootstrap.FilenameWALRootCheckpoint)); err == nil {
		return true, nil
	} else if os.IsNotExist(err) {
		return false, nil
	} else {
		return false, err
	}
}

func (c *Checkpointer) RemoveCheckpoint(checkpoint int) error {
	name := NumberToFilename(checkpoint)
	return deleteCheckpointFiles(c.dir, name)
}

func LoadCheckpoint(filepath string, logger zerolog.Logger) (
	tries []*trie.MTrie,
	errToReturn error,
) {
	file, err := os.Open(filepath)
	if err != nil {
		return nil, fmt.Errorf("cannot open checkpoint file %s: %w", filepath, err)
	}
	defer func() {
		evictErr := evictFileFromLinuxPageCache(file, false, logger)
		if evictErr != nil {
			logger.Warn().Msgf("failed to evict file %s from Linux page cache: %s", filepath, evictErr)
			// No need to return this error because it's possible to continue normal operations.
		}

		errToReturn = closeAndMergeError(file, errToReturn)
	}()

	return readCheckpoint(file, logger)
}

func readCheckpoint(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {

	// Read header: magic (2 bytes) + version (2 bytes)
	header := make([]byte, headerSize)
	_, err := io.ReadFull(f, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Decode header
	magicBytes := binary.BigEndian.Uint16(header)
	version := binary.BigEndian.Uint16(header[encMagicSize:])

	// Reset offset
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	if magicBytes != MagicBytesCheckpointHeader {
		return nil, fmt.Errorf("unknown file format. Magic constant %x does not match expected %x", magicBytes, MagicBytesCheckpointHeader)
	}

	switch version {
	case VersionV1, VersionV3:
		return readCheckpointV3AndEarlier(f, version)
	case VersionV4:
		return readCheckpointV4(f)
	case VersionV5:
		return readCheckpointV5(f, logger)
	case VersionV6:
		return readCheckpointV6(f, logger)
	default:
		return nil, fmt.Errorf("unsupported file version %x", version)
	}
}

type nodeWithRegMetrics struct {
	n        *node.Node
	regCount uint64
	regSize  uint64
}

// readCheckpointV3AndEarlier deserializes a checkpoint file (version 3 and earlier) and returns a list of tries.
// The header (magic and version) is verified by the caller.
// This function is for backwards compatibility and is not optimized.
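// Unlike v4 and v5, the node count and trie count live at the front of the
// file (right after magic and version) rather than in a trailing footer,
// and the CRC32 sum at the end is present only from v3 onward.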
func readCheckpointV3AndEarlier(f *os.File, version uint16) ([]*trie.MTrie, error) {

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)

	var reader io.Reader

	if version != VersionV3 {
		reader = bufReader
	} else {
		reader = crcReader
	}

	// Read header (magic + version), node count, and trie count.
	header := make([]byte, headerSize+encNodeCountSize+encTrieCountSize)

	_, err := io.ReadFull(reader, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Magic and version are verified by the caller.

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(header[headerSize:])
	triesCount := binary.BigEndian.Uint16(header[headerSize+encNodeCountSize:])

	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendants-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	if version == VersionV3 {
		crc32buf := make([]byte, crc32SumSize)

		_, err := io.ReadFull(bufReader, crc32buf)
		if err != nil {
			return nil, fmt.Errorf("cannot read CRC32: %w", err)
		}

		readCrc32 := binary.BigEndian.Uint32(crc32buf)

		calculatedCrc32 := crcReader.Crc32()

		if calculatedCrc32 != readCrc32 {
			return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
		}
	}

	return tries, nil
}

// readCheckpointV4 decodes a checkpoint file (version 4) and returns a list of tries.
// The header (magic and version) is verified by the caller.
// This function is for backwards compatibility.
func readCheckpointV4(f *os.File) ([]*trie.MTrie, error) {

	// The scratch buffer is a temporary buffer that the reader can read into.
	// Raw data in the scratch buffer should be copied or converted into desired
	// objects before the next Read operation. If the scratch buffer isn't large
	// enough, a new buffer will be allocated. However, 4096 bytes is
	// large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read the footer to get the node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum

	// Seek to footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of the file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes).
	// No action is needed for the header because it is verified by the caller.
	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// The nodes slice's element at index 0 is special: it means nil.
	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendants-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read the footer again so it feeds into the crc32 computation.
	// No other action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}

// readCheckpointV5 decodes a checkpoint file (version 5) and returns a list of tries.
// The checkpoint file header (magic and version) is verified by the caller.
func readCheckpointV5(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {
	logger.Info().Msgf("reading v5 checkpoint file")

	// The scratch buffer is a temporary buffer that the reader can read into.
	// Raw data in the scratch buffer should be copied or converted into desired
	// objects before the next Read operation. If the scratch buffer isn't large
	// enough, a new buffer will be allocated. However, 4096 bytes is
	// large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read the footer to get the node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum

	// Seek to footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of the file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes).
	// No action is needed for the header because it is verified by the caller.
	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// The nodes slice's element at index 0 is special: it means nil.
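	// The +1 below mirrors the writer side, where nodeCounter starts at 1
	// because index 0 is reserved to mean the nil node.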
	nodes := make([]*node.Node, nodesCount+1) // +1 for the 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	logging := logProgress("reading trie nodes", int(nodesCount), logger)

	for i := uint64(1); i <= nodesCount; i++ {
		n, err := flattener.ReadNode(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(i) {
				return nil, fmt.Errorf("sequence of serialized nodes does not satisfy Descendants-First-Relationship")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i] = n
		logging(i)
	}

	logger.Info().Msgf("finished loading %v trie nodes, start loading %v tries", nodesCount, triesCount)

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrie(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read the footer again so it feeds into the crc32 computation.
	// No other action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}

// evictFileFromLinuxPageCache advises Linux to evict a file from the Linux page cache.
// A use case is when a new checkpoint is loaded or created: Linux may cache big
// checkpoint files in memory until evictFileFromLinuxPageCache causes them to be
// evicted from the Linux page cache. Not calling evictFileFromLinuxPageCache()
// causes two checkpoint files to be cached for each checkpointing, eventually
// caching hundreds of GB.
// CAUTION: no-op when GOOS != linux.
func evictFileFromLinuxPageCache(f *os.File, fsync bool, logger zerolog.Logger) error {
	err := fadviseNoLinuxPageCache(f.Fd(), fsync)
	if err != nil {
		return err
	}

	size := int64(0)
	fstat, err := f.Stat()
	if err == nil {
		size = fstat.Size()
	}

	logger.Info().Str("filename", f.Name()).Int64("size_mb", size/1024/1024).Msg("evicted file from Linux page cache")
	return nil
}

// CopyCheckpointFile copies the checkpoint file, including its part files, from the
// given `from` directory to the `to` directory.
// It returns the paths of all the copied files.
// Any errors returned are exceptions.
func CopyCheckpointFile(filename string, from string, to string) (
	[]string,
	error,
) {
	// It's possible that the trie dir does not yet exist;
	// if not, this will create the required path.
	err := os.MkdirAll(to, 0700)
	if err != nil {
		return nil, err
	}

	// checkpoint V6 produces multiple checkpoint part files that need to be copied over
	pattern := filePathPattern(from, filename)
	matched, err := filepath.Glob(pattern)
	if err != nil {
		return nil, fmt.Errorf("could not glob checkpoint file with pattern %v: %w", pattern, err)
	}

	newPaths := make([]string, len(matched))
	// copy the checkpoint part files concurrently
	var group errgroup.Group

	for i, match := range matched {
		_, partfile := filepath.Split(match)
		newPath := filepath.Join(to, partfile)
		newPaths[i] = newPath

		match := match
		group.Go(func() error {
			err := utilsio.Copy(match, newPath)
			if err != nil {
				return fmt.Errorf("cannot copy file from %v to %v", match, newPath)
			}
			return nil
		})
	}

	err = group.Wait()
	if err != nil {
		return nil, fmt.Errorf("failed to copy checkpoint files: %w", err)
	}

	return newPaths, nil
}
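
// Usage sketch (illustrative only, not part of the API): wiring a Checkpointer
// to an existing DiskWAL, creating a checkpoint up to segment 10, and loading
// it back. The diskWAL, keyByteSize, and forestCapacity values are assumed to
// be provided by the caller.
//
//	checkpointer := NewCheckpointer(diskWAL, keyByteSize, forestCapacity)
//	if err := checkpointer.Checkpoint(10); err != nil {
//		// handle error
//	}
//	tries, err := checkpointer.LoadCheckpoint(10)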