github.com/onflow/flow-go@v0.33.17/ledger/complete/wal/checkpointer.go

package wal

import (
	"bufio"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/docker/go-units"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	"golang.org/x/sync/errgroup"

	"github.com/onflow/flow-go/ledger"
	"github.com/onflow/flow-go/ledger/complete/mtrie"
	"github.com/onflow/flow-go/ledger/complete/mtrie/flattener"
	"github.com/onflow/flow-go/ledger/complete/mtrie/node"
	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
	"github.com/onflow/flow-go/model/bootstrap"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/module/util"
	utilsio "github.com/onflow/flow-go/utils/io"
)

const checkpointFilenamePrefix = "checkpoint."

const MagicBytesCheckpointHeader uint16 = 0x2137
const MagicBytesCheckpointSubtrie uint16 = 0x2136
const MagicBytesCheckpointToptrie uint16 = 0x2135

const VersionV1 uint16 = 0x01

// Versioning was reset while changing the trie format, so the version is bumped to 3 to avoid conflicts.
// Version 3 contains a file checksum for detecting corrupted checkpoint files.
const VersionV3 uint16 = 0x03

// Version 4 contains a footer with node count and trie count (previously in the header).
// Version 4 also reduces checkpoint data size.  See EncodeNode() and EncodeTrie() for more details.
const VersionV4 uint16 = 0x04

// Version 5 includes these changes:
// - remove regCount and maxDepth from serialized nodes
// - add allocated register count and size to serialized tries
// - reduce the number of bytes used to encode payload value size from 8 bytes to 4 bytes.
// See EncodeNode() and EncodeTrie() for more details.
const VersionV5 uint16 = 0x05

// Version 6 includes these changes:
//   - trie nodes are stored in additional 17 checkpoint files, with .0, .1, .2, ... .16 as
//     file name extension
const VersionV6 uint16 = 0x06

// MaxVersion is the latest checkpoint version we support.
// MaxVersion needs to be updated when creating a newer version.
const MaxVersion = VersionV6

const (
	encMagicSize        = 2
	encVersionSize      = 2
	headerSize          = encMagicSize + encVersionSize
	encSubtrieCountSize = 2
	encNodeCountSize    = 8
	encTrieCountSize    = 2
	crc32SumSize        = 4
)

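// For reference, a V5 checkpoint file written by StoreCheckpointV5 below is
// laid out as follows (sizes come from the constants above):
//
//	header:  magic (2 bytes) + version (2 bytes)
//	nodes:   encoded nodes in an order satisfying Descendents-First-Relationship
//	tries:   encoded tries, each referencing its root node by index
//	footer:  node count (8 bytes) + trie count (2 bytes)
//	crc32:   CRC32 sum (4 bytes) of everything preceding it
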
// defaultBufioReadSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioReadSize can be tuned (8KiB, 16KiB, 32KiB, etc.) if it
// improves performance on typical EN hardware.
const defaultBufioReadSize = 1024 * 32

// defaultBufioWriteSize replaces the default bufio buffer size of 4096 bytes.
// defaultBufioWriteSize can be tuned (8KiB, 16KiB, 32KiB, etc.) if it
// improves performance on typical EN hardware.
const defaultBufioWriteSize = 1024 * 32

type Checkpointer struct {
	dir            string
	wal            *DiskWAL
	keyByteSize    int
	forestCapacity int
}

func NewCheckpointer(wal *DiskWAL, keyByteSize int, forestCapacity int) *Checkpointer {
	return &Checkpointer{
		dir:            wal.wal.Dir(),
		wal:            wal,
		keyByteSize:    keyByteSize,
		forestCapacity: forestCapacity,
	}
}

// listCheckpoints returns all the numbers (unsorted) of the checkpoint files, and the number of the last checkpoint.
func (c *Checkpointer) listCheckpoints() ([]int, int, error) {
	return ListCheckpoints(c.dir)
}

// ListCheckpoints returns all the numbers of the checkpoint files, and the number of the last checkpoint.
// Note: it doesn't include the root checkpoint file.
func ListCheckpoints(dir string) ([]int, int, error) {
	list := make([]int, 0)

	files, err := os.ReadDir(dir)
	if err != nil {
		return nil, -1, fmt.Errorf("cannot list directory [%s] content: %w", dir, err)
	}
	last := -1
	for _, fn := range files {
		fname := fn.Name()
		if !strings.HasPrefix(fname, checkpointFilenamePrefix) {
			continue
		}
		justNumber := fname[len(checkpointFilenamePrefix):]
		k, err := strconv.Atoi(justNumber)
		if err != nil {
			continue
		}

		list = append(list, k)

		// the last checkpoint is the one with the highest number
		if k > last {
			last = k
		}
	}

	return list, last, nil
}

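// latestCheckpointFilename is a hypothetical helper (a usage sketch only, not
// called anywhere in this package) showing how ListCheckpoints and
// NumberToFilename compose: it returns the filename of the newest checkpoint
// in dir, or "" if the directory contains no checkpoint files.
func latestCheckpointFilename(dir string) (string, error) {
	_, last, err := ListCheckpoints(dir)
	if err != nil {
		return "", fmt.Errorf("cannot list checkpoints: %w", err)
	}
	if last < 0 {
		// no checkpoint files found
		return "", nil
	}
	return NumberToFilename(last), nil
}
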
// Checkpoints returns all the numbers of the checkpoint files in ascending order.
// Note: it doesn't include the root checkpoint file.
func (c *Checkpointer) Checkpoints() ([]int, error) {
	return Checkpoints(c.dir)
}

// Checkpoints returns all the checkpoint numbers in ascending order.
func Checkpoints(dir string) ([]int, error) {
	list, _, err := ListCheckpoints(dir)
	if err != nil {
		return nil, fmt.Errorf("could not fetch all checkpoints: %w", err)
	}

	sort.Ints(list)

	return list, nil
}

// LatestCheckpoint returns the number of the latest checkpoint, or -1 if there are no checkpoints.
func (c *Checkpointer) LatestCheckpoint() (int, error) {
	_, last, err := c.listCheckpoints()
	return last, err
}

// NotCheckpointedSegments returns the numbers of the segments which are not checkpointed yet,
// or -1, -1 if there are no segments.
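// For example, if segments 0..10 exist on disk and the latest checkpoint is 7,
// NotCheckpointedSegments returns (8, 10).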
func (c *Checkpointer) NotCheckpointedSegments() (from, to int, err error) {

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get last checkpoint: %w", err)
	}

	first, last, err := c.wal.Segments()
	if err != nil {
		return -1, -1, fmt.Errorf("cannot get range of segments: %w", err)
	}

	// there are no segments at all, there is nothing to checkpoint
	if first == -1 && last == -1 {
		return -1, -1, nil
	}

	// no checkpoints
	if latestCheckpoint == -1 {
		return first, last, nil
	}

	// segments before checkpoint
	if last <= latestCheckpoint {
		return -1, -1, nil
	}

	// there is a gap between the last checkpoint and the segments
	if last > latestCheckpoint && latestCheckpoint < first-1 {
		return -1, -1, fmt.Errorf("gap between last checkpoint and segments")
	}

	return latestCheckpoint + 1, last, nil
}

// Checkpoint creates a new checkpoint stopping at the given segment.
func (c *Checkpointer) Checkpoint(to int) (err error) {

	_, notCheckpointedTo, err := c.NotCheckpointedSegments()
	if err != nil {
		return fmt.Errorf("cannot get not checkpointed segments: %w", err)
	}

	latestCheckpoint, err := c.LatestCheckpoint()
	if err != nil {
		return fmt.Errorf("cannot get latest checkpoint: %w", err)
	}

	if latestCheckpoint == to {
		return nil // nothing to do
	}

	if notCheckpointedTo < to {
		return fmt.Errorf("no segments to checkpoint to %d, latest not checkpointed segment: %d", to, notCheckpointedTo)
	}

	forest, err := mtrie.NewForest(c.forestCapacity, &metrics.NoopCollector{}, nil)
	if err != nil {
		return fmt.Errorf("cannot create Forest: %w", err)
	}

	c.wal.log.Info().Msgf("creating checkpoint %d", to)

	err = c.wal.replay(0, to,
		func(tries []*trie.MTrie) error {
			return forest.AddTries(tries)
		},
		func(update *ledger.TrieUpdate) error {
			_, err := forest.Update(update)
			return err
		}, func(rootHash ledger.RootHash) error {
			return nil
		}, true)

	if err != nil {
		return fmt.Errorf("cannot replay WAL: %w", err)
	}

	tries, err := forest.GetTries()
	if err != nil {
		return fmt.Errorf("cannot get forest tries: %w", err)
	}

	c.wal.log.Info().Msgf("serializing checkpoint %d", to)

	fileName := NumberToFilename(to)

	err = StoreCheckpointV6SingleThread(tries, c.wal.dir, fileName, c.wal.log)

	if err != nil {
		return fmt.Errorf("could not create checkpoint for %v: %w", to, err)
	}

	checkpointFileSize, err := ReadCheckpointFileSize(c.wal.dir, fileName)
	if err != nil {
		return fmt.Errorf("could not read checkpoint file size: %w", err)
	}

	c.wal.log.Info().
		Str("checkpoint_file_size", units.BytesSize(float64(checkpointFileSize))).
		Msgf("created checkpoint %d with %d tries", to, len(tries))

	return nil
}

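// A minimal usage sketch for Checkpoint (illustrative only: the key size and
// forest capacity below are placeholders, and error handling is elided):
//
//	c := NewCheckpointer(diskWAL, 32, 10000)
//	from, to, err := c.NotCheckpointedSegments()
//	if err == nil && from >= 0 {
//		err = c.Checkpoint(to) // replays segments 0..to and writes the checkpoint file
//	}
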
func NumberToFilenamePart(n int) string {
	return fmt.Sprintf("%08d", n)
}

func NumberToFilename(n int) string {
	return fmt.Sprintf("%s%s", checkpointFilenamePrefix, NumberToFilenamePart(n))
}

func (c *Checkpointer) CheckpointWriter(to int) (io.WriteCloser, error) {
	return CreateCheckpointWriterForFile(c.dir, NumberToFilename(to), c.wal.log)
}

func (c *Checkpointer) Dir() string {
	return c.dir
}

// CreateCheckpointWriterForFile returns a file writer that will write to a temporary file and then move it to the checkpoint folder by renaming it.
func CreateCheckpointWriterForFile(dir, filename string, logger zerolog.Logger) (io.WriteCloser, error) {

	fullname := path.Join(dir, filename)

	if utilsio.FileExists(fullname) {
		return nil, fmt.Errorf("checkpoint file %s already exists", fullname)
	}

	tmpFile, err := os.CreateTemp(dir, "writing-chkpnt-*")
	if err != nil {
		return nil, fmt.Errorf("cannot create temporary file for checkpoint in %v: %w", dir, err)
	}

	writer := bufio.NewWriterSize(tmpFile, defaultBufioWriteSize)
	return &SyncOnCloseRenameFile{
		logger:     logger,
		file:       tmpFile,
		targetName: fullname,
		Writer:     writer,
	}, nil
}

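// A minimal usage sketch (assuming, as the type name suggests, that
// SyncOnCloseRenameFile flushes, syncs, and renames the temporary file to
// targetName on Close; error handling elided):
//
//	w, err := CreateCheckpointWriterForFile(dir, NumberToFilename(7), logger)
//	if err != nil {
//		return err
//	}
//	_, err = w.Write(data) // buffered writes go to the temporary file
//	err = w.Close()        // the file appears under its final name only here
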
// StoreCheckpointV5 writes the given tries to checkpoint file, and also appends
// a CRC32 file checksum for integrity check.
// Checkpoint file consists of a flattened forest. Specifically, it consists of:
//   - a list of encoded nodes, where references to other nodes are by list index.
//   - a list of encoded tries, each referencing their respective root node by index.
//
// Referencing another node by index 0 is a special case, meaning nil.
//
// As an important property, the nodes are listed in an order which satisfies
// Descendents-First-Relationship. The Descendents-First-Relationship has the
// following important property:
// when rebuilding the trie from the sequence of nodes, the trie can be built
// on the fly, because for each node, its children have already been encountered.
// TODO: evaluate alternatives to CRC32 since checkpoint file is many GB in size.
// TODO: add concurrency if the performance gains are enough to offset complexity.
func StoreCheckpointV5(dir string, fileName string, logger zerolog.Logger, tries ...*trie.MTrie) (
	// Note: the returned error must be named (errToReturn, rather than a bare
	// `error`), because it needs to be defined in order to be updated by the
	// deferred function below.
	errToReturn error,
) {
	writer, err := CreateCheckpointWriterForFile(dir, fileName, logger)
	if err != nil {
		return fmt.Errorf("could not create writer: %w", err)
	}
	defer func() {
		errToReturn = closeAndMergeError(writer, errToReturn)
	}()

	crc32Writer := NewCRC32Writer(writer)

	// Scratch buffer is used as temporary buffer that node can encode into.
	// Data in scratch buffer should be copied or used before scratch buffer is used again.
	// If the scratch buffer isn't large enough, a new buffer will be allocated.
	// However, 4096 bytes will be large enough to handle almost all payloads
	// and 100% of interim nodes.
	scratch := make([]byte, 1024*4)

	// Write header: magic (2 bytes) + version (2 bytes)
	header := scratch[:headerSize]
	binary.BigEndian.PutUint16(header, MagicBytesCheckpointHeader)
	binary.BigEndian.PutUint16(header[encMagicSize:], VersionV5)

	_, err = crc32Writer.Write(header)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint header: %w", err)
	}

	// Multiple tries might share nodes at higher levels. However, we don't want to
	// serialize duplicated nodes in the checkpoint file. In order to deduplicate, we build
	// a map of unique nodes while iterating and serializing the nodes to the checkpoint file.
	//
	// A map for deduplication that contains all the trie nodes uses a lot of memory.
	// In fact, we don't have to build a map for all nodes, since some nodes are
	// never shared. Nodes can be shared only if they are on the same path;
	// in other words, nodes on different paths are never shared.
	// If we group trie nodes by path, then we have many smaller groups of trie nodes from the same path,
	// which might contain duplicates. Then, for each group, we can build a smaller map for deduplication.
	// Processing each group sequentially allows us to reduce operational memory.
	//
	// With this idea in mind, the serialization can be done in two steps:
	// 1. serialize nodes in subtries (tries with root at subtrieLevel).
	// 2. serialize remaining nodes (from trie root to subtrie root).
	// For instance, if there are 3 top tries, and subtrieLevel is 4, then there will be
	// 	(2 ^ 4) * 3 = 48 subtrie root nodes at level 4.
	// Step 1 will serialize the 48 subtries into the checkpoint file, and
	// then step 2 will serialize the 3 root nodes (level 0) and the interim nodes from level 1 to 3
	// into the checkpoint file.
	//
	// Step 1:
	// 1. Find all the subtrie root nodes at subtrieLevel (level 4).
	// 2. Group the subtries by path. Since subtries in different groups have different paths, they won't have
	//		shared child nodes. Subtries in the same group might contain duplicates, so we build a map to deduplicate them.
	//
	// subtrieLevel is the number of edges from the trie root to a subtrie root.
	// The trie root is at level 0.
	const subtrieLevel = 4

	// subtrieCount is the number of subtries at subtrieLevel.
	const subtrieCount = 1 << subtrieLevel

	// Since each trie has `subtrieCount` subtries at subtrieLevel,
	// we create `subtrieCount` groups, each containing the subtrie root nodes
	// at the same position from all tries.

	// subtrieRoots is an array of groups.
	// Each group contains the subtrie roots of the same path at subtrieLevel for different tries.
	// For example, if subtrieLevel is 4, then
	// - subtrieRoots[0] is a list of all subtrie roots at path [0,0,0,0]
	// - subtrieRoots[1] is a list of all subtrie roots at path [0,0,0,1]
	// - subtrieRoots[subtrieCount-1] is a list of all subtrie roots at path [1,1,1,1]
	// Subtrie roots in subtrieRoots[0] have the same path, and therefore might have shared child nodes.
	var subtrieRoots [subtrieCount][]*node.Node
	for i := 0; i < len(subtrieRoots); i++ {
		subtrieRoots[i] = make([]*node.Node, len(tries))
	}

	for trieIndex, t := range tries {
		// subtries is an array with subtrieCount trie nodes
		// in breadth-first order at subtrieLevel of the trie `t`
		subtries := getNodesAtLevel(t.RootNode(), subtrieLevel)
		for subtrieIndex, subtrieRoot := range subtries {
			subtrieRoots[subtrieIndex][trieIndex] = subtrieRoot
		}
	}

	// topLevelNodes contains all unique nodes of given tries
	// from root to subtrie root and their index
	// (ordered by node traversal sequence).
	// Index 0 is a special case with nil node.
	topLevelNodes := make(map[*node.Node]uint64, 1<<(subtrieLevel+1))
	topLevelNodes[nil] = 0

	// nodeCounter is counter for all unique nodes.
	// It starts from 1, as 0 marks nil node.
	nodeCounter := uint64(1)

	// estimatedSubtrieNodeCount is a rough estimate of the number of nodes in a subtrie,
	// assuming the trie is a full binary tree.  estimatedSubtrieNodeCount is used
	// to preallocate traversedSubtrieNodes for memory efficiency.
	estimatedSubtrieNodeCount := 0
	if len(tries) > 0 {
		estimatedTrieNodeCount := 2*int(tries[0].AllocatedRegCount()) - 1
		estimatedSubtrieNodeCount = estimatedTrieNodeCount / subtrieCount
	}

	// Serialize subtrie nodes
	for i, subTrieRoot := range subtrieRoots {
		// traversedSubtrieNodes contains all unique nodes of subtries of the same path and their index.
		traversedSubtrieNodes := make(map[*node.Node]uint64, estimatedSubtrieNodeCount)
		// Index 0 is a special case with nil node.
		traversedSubtrieNodes[nil] = 0

		logging := logProgress(fmt.Sprintf("storing %v-th sub trie roots", i), estimatedSubtrieNodeCount, log.Logger)
		for _, root := range subTrieRoot {
			// The empty trie is always added to the forest as a starting point, and
			// the empty trie's root is nil. It remains in the forest until evicted
			// by the trie queue exceeding capacity.
			if root == nil {
				continue
			}
			// Note: nodeCounter assigns a global index to each node in the order it is serialized
			// into the checkpoint file. Therefore, it has to be carried over when iterating each subtrie.
			// storeUniqueNodes adds each unique visited node to traversedSubtrieNodes, with the node
			// itself as key, and the value being the n-th node serialized into the checkpoint file.
			nodeCounter, err = storeUniqueNodes(root, traversedSubtrieNodes, nodeCounter, scratch, crc32Writer, logging)
			if err != nil {
				return fmt.Errorf("fail to store nodes in step 1 for subtrie root %v: %w", root.Hash(), err)
			}
			// Save subtrie root node index in topLevelNodes,
			// so when traversing top level tries
			// (from level 0 to subtrieLevel) using topLevelNodes,
			// node iterator skips subtrie as visited nodes.
			topLevelNodes[root] = traversedSubtrieNodes[root]
		}
	}

	// Step 2:
	// Now all nodes at subtrieLevel and below have been serialized. We now
	// serialize the remaining nodes of each trie, from the root node (level 0) down to (subtrieLevel - 1).
	for _, t := range tries {
		root := t.RootNode()
		if root == nil {
			continue
		}
		// If we iterated through the root trie with an empty visited-nodes map, it would visit
		// all nodes at all levels. In order to skip the nodes at subtrieLevel and below, since they were serialized in step 1,
		// we need to pass in a visited-nodes map that contains all the subtrie root nodes, which is topLevelNodes.
		// topLevelNodes was built in step 1, while serializing each subtrie root node.
		nodeCounter, err = storeUniqueNodes(root, topLevelNodes, nodeCounter, scratch, crc32Writer, func(uint64) {})
		if err != nil {
			return fmt.Errorf("fail to store nodes in step 2 for root trie %v: %w", root.Hash(), err)
		}
	}

	// The root tries are serialized at the end of the checkpoint file, so that it's easy to find which tries are
	// included.
	for _, t := range tries {
		rootNode := t.RootNode()
		if !t.IsEmpty() && rootNode.Height() != ledger.NodeMaxHeight {
			return fmt.Errorf("height of root node must be %d, but is %d",
				ledger.NodeMaxHeight, rootNode.Height())
		}

		// Get root node index
		rootIndex, found := topLevelNodes[rootNode]
		if !found {
			rootHash := t.RootHash()
			return fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(rootHash[:]))
		}

		encTrie := flattener.EncodeTrie(t, rootIndex, scratch)
		_, err = crc32Writer.Write(encTrie)
		if err != nil {
			return fmt.Errorf("cannot serialize trie: %w", err)
		}
	}

	// All trie nodes have been serialized into the checkpoint file; now
	// write the footer with the node count and trie count.
	footer := scratch[:encNodeCountSize+encTrieCountSize]
	binary.BigEndian.PutUint64(footer, nodeCounter-1) // -1 to account for 0 node meaning nil
	binary.BigEndian.PutUint16(footer[encNodeCountSize:], uint16(len(tries)))

	_, err = crc32Writer.Write(footer)
	if err != nil {
		return fmt.Errorf("cannot write checkpoint footer: %w", err)
	}

	// Write CRC32 sum of the file for validation
	crc32buf := scratch[:crc32SumSize]
	binary.BigEndian.PutUint32(crc32buf, crc32Writer.Crc32())

	_, err = writer.Write(crc32buf)
	if err != nil {
		return fmt.Errorf("cannot write CRC32: %w", err)
	}

	return nil
}

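// A minimal usage sketch for StoreCheckpointV5 (illustrative only; the tries
// would come from a real forest, and the checkpoint number is a placeholder):
//
//	tries, err := forest.GetTries()
//	if err != nil {
//		return err
//	}
//	err = StoreCheckpointV5(dir, NumberToFilename(42), logger, tries...)
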
func logProgress(msg string, estimatedSubtrieNodeCount int, logger zerolog.Logger) func(nodeCounter uint64) {
	lg := util.LogProgress(
		logger,
		util.DefaultLogProgressConfig(
			msg,
			estimatedSubtrieNodeCount,
		),
	)
	return func(index uint64) {
		lg(1)
	}
}

// storeUniqueNodes iterates and serializes unique nodes for trie with given root node.
// It also saves unique nodes and node counter in visitedNodes map.
// It returns nodeCounter and error (if any).
func storeUniqueNodes(
	root *node.Node,
	visitedNodes map[*node.Node]uint64,
	nodeCounter uint64,
	scratch []byte,
	writer io.Writer,
	nodeCounterUpdated func(nodeCounter uint64), // for logging estimated progress
) (uint64, error) {

	for itr := flattener.NewUniqueNodeIterator(root, visitedNodes); itr.Next(); {
		n := itr.Value()

		visitedNodes[n] = nodeCounter
		nodeCounter++
		nodeCounterUpdated(nodeCounter)

		var lchildIndex, rchildIndex uint64

		if lchild := n.LeftChild(); lchild != nil {
			var found bool
			lchildIndex, found = visitedNodes[lchild]
			if !found {
				hash := lchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}
		if rchild := n.RightChild(); rchild != nil {
			var found bool
			rchildIndex, found = visitedNodes[rchild]
			if !found {
				hash := rchild.Hash()
				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
			}
		}

		encNode := flattener.EncodeNode(n, lchildIndex, rchildIndex, scratch)
		_, err := writer.Write(encNode)
		if err != nil {
			return 0, fmt.Errorf("cannot serialize node: %w", err)
		}
	}

	return nodeCounter, nil
}

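// For illustration: serializing a three-node trie (root R with children A and
// B) into an initially empty visitedNodes map assigns indices in
// descendants-first order, e.g. A -> 1, B -> 2, R -> 3. R's encoding then
// references its children by indices 1 and 2, so a reader processing nodes in
// file order has already materialized both children when it reaches R.
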
// getNodesAtLevel returns 2^level nodes at given level in breadth-first order.
// It guarantees size and order of returned nodes (nil element if no node at the position).
// For example, given nil root and level 3, getNodesAtLevel returns a slice
// of 2^3 nil elements.
func getNodesAtLevel(root *node.Node, level uint) []*node.Node {
	nodes := []*node.Node{root}
	nodesLevel := uint(0)

	// Use breadth first traversal to get all nodes at given level.
	// If a node isn't found, a nil node is used in its place.
	for nodesLevel < level {
		nextLevel := nodesLevel + 1
		nodesAtNextLevel := make([]*node.Node, 1<<nextLevel)

		for i, n := range nodes {
			if n != nil {
				nodesAtNextLevel[i*2] = n.LeftChild()
				nodesAtNextLevel[i*2+1] = n.RightChild()
			}
		}

		nodes = nodesAtNextLevel
		nodesLevel = nextLevel
	}

	return nodes
}

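// For illustration: getNodesAtLevel(R, 2) returns R's four grandchildren in
// breadth-first (left-to-right) order, with nil placeholders where a
// grandchild is absent, so the result always has 2^2 = 4 elements.
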
func (c *Checkpointer) LoadCheckpoint(checkpoint int) ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, NumberToFilename(checkpoint))
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) LoadRootCheckpoint() ([]*trie.MTrie, error) {
	filepath := path.Join(c.dir, bootstrap.FilenameWALRootCheckpoint)
	return LoadCheckpoint(filepath, c.wal.log)
}

func (c *Checkpointer) HasRootCheckpoint() (bool, error) {
	return HasRootCheckpoint(c.dir)
}

func HasRootCheckpoint(dir string) (bool, error) {
	if _, err := os.Stat(path.Join(dir, bootstrap.FilenameWALRootCheckpoint)); err == nil {
		return true, nil
	} else if os.IsNotExist(err) {
		return false, nil
	} else {
		return false, err
	}
}

func (c *Checkpointer) RemoveCheckpoint(checkpoint int) error {
	name := NumberToFilename(checkpoint)
	return deleteCheckpointFiles(c.dir, name)
}

func LoadCheckpoint(filepath string, logger zerolog.Logger) (
	tries []*trie.MTrie,
	errToReturn error) {
	file, err := os.Open(filepath)
	if err != nil {
		return nil, fmt.Errorf("cannot open checkpoint file %s: %w", filepath, err)
	}
	defer func() {
		evictErr := evictFileFromLinuxPageCache(file, false, logger)
		if evictErr != nil {
			logger.Warn().Msgf("failed to evict file %s from Linux page cache: %s", filepath, evictErr)
			// No need to return this error because it's possible to continue normal operations.
		}

		errToReturn = closeAndMergeError(file, errToReturn)
	}()

	return readCheckpoint(file, logger)
}

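// A minimal usage sketch for LoadCheckpoint (the path is a placeholder):
//
//	tries, err := LoadCheckpoint("/data/trie/checkpoint.00000042", logger)
//	if err != nil {
//		return err
//	}
//	// the decoded tries can now be added to a forest, e.g. forest.AddTries(tries)
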
func readCheckpoint(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {

	// Read header: magic (2 bytes) + version (2 bytes)
	header := make([]byte, headerSize)
	_, err := io.ReadFull(f, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Decode header
	magicBytes := binary.BigEndian.Uint16(header)
	version := binary.BigEndian.Uint16(header[encMagicSize:])

	// Reset offset
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	if magicBytes != MagicBytesCheckpointHeader {
		return nil, fmt.Errorf("unknown file format. Magic constant %x does not match expected %x", magicBytes, MagicBytesCheckpointHeader)
	}

	switch version {
	case VersionV1, VersionV3:
		return readCheckpointV3AndEarlier(f, version)
	case VersionV4:
		return readCheckpointV4(f)
	case VersionV5:
		return readCheckpointV5(f, logger)
	case VersionV6:
		return readCheckpointV6(f, logger)
	default:
		return nil, fmt.Errorf("unsupported file version %x", version)
	}
}

type nodeWithRegMetrics struct {
	n        *node.Node
	regCount uint64
	regSize  uint64
}

// readCheckpointV3AndEarlier deserializes checkpoint file (version 3 and earlier) and returns a list of tries.
// Header (magic and version) is verified by the caller.
// This function is for backwards compatibility, not optimized.
func readCheckpointV3AndEarlier(f *os.File, version uint16) ([]*trie.MTrie, error) {

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)

	var reader io.Reader

	if version != VersionV3 {
		reader = bufReader
	} else {
		reader = crcReader
	}

	// Read header (magic + version), node count, and trie count.
	header := make([]byte, headerSize+encNodeCountSize+encTrieCountSize)

	_, err := io.ReadFull(reader, header)
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// Magic and version are verified by the caller.

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(header[headerSize:])
	triesCount := binary.BigEndian.Uint16(header[headerSize+encNodeCountSize:])

	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendents-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	if version == VersionV3 {
		crc32buf := make([]byte, crc32SumSize)

		_, err := io.ReadFull(bufReader, crc32buf)
		if err != nil {
			return nil, fmt.Errorf("cannot read CRC32: %w", err)
		}

		readCrc32 := binary.BigEndian.Uint32(crc32buf)

		calculatedCrc32 := crcReader.Crc32()

		if calculatedCrc32 != readCrc32 {
			return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
		}
	}

	return tries, nil
}

// readCheckpointV4 decodes checkpoint file (version 4) and returns a list of tries.
// Header (magic and version) is verified by the caller.
// This function is for backwards compatibility.
func readCheckpointV4(f *os.File) ([]*trie.MTrie, error) {

	// Scratch buffer is used as temporary buffer that reader can read into.
	// Raw data in scratch buffer should be copied or converted into desired
	// objects before next Read operation.  If the scratch buffer isn't large
	// enough, a new buffer will be allocated.  However, 4096 bytes will
	// be large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read footer to get node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum

	// Seek to footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes)
	// No action is needed for header because it is verified by the caller.

	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// nodes' element at index 0 is a special case, meaning nil.
	nodes := make([]nodeWithRegMetrics, nodesCount+1) // +1 for 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	for i := uint64(1); i <= nodesCount; i++ {
		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(i) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendents-First-Relationship")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i].n = n
		nodes[i].regCount = regCount
		nodes[i].regSize = regSize
	}

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrieFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			nm := nodes[nodeIndex]
			return nm.n, nm.regCount, nm.regSize, nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read footer again for crc32 computation
	// No action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}

// readCheckpointV5 decodes checkpoint file (version 5) and returns a list of tries.
// Checkpoint file header (magic and version) is verified by the caller.
func readCheckpointV5(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {
	logger.Info().Msgf("reading v5 checkpoint file")

	// Scratch buffer is used as temporary buffer that reader can read into.
	// Raw data in scratch buffer should be copied or converted into desired
	// objects before next Read operation.  If the scratch buffer isn't large
	// enough, a new buffer will be allocated.  However, 4096 bytes will
	// be large enough to handle almost all payloads and 100% of interim nodes.
	scratch := make([]byte, 1024*4) // must not be less than 1024

	// Read footer to get node count and trie count

	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum

	// Seek to footer
	_, err := f.Seek(-footerOffset, io.SeekEnd)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to footer: %w", err)
	}

	footer := scratch[:footerSize]

	_, err = io.ReadFull(f, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Decode node count and trie count
	nodesCount := binary.BigEndian.Uint64(footer)
	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])

	// Seek to the start of file
	_, err = f.Seek(0, io.SeekStart)
	if err != nil {
		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
	}

	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
	crcReader := NewCRC32Reader(bufReader)
	var reader io.Reader = crcReader

	// Read header: magic (2 bytes) + version (2 bytes)
	// No action is needed for header because it is verified by the caller.

	_, err = io.ReadFull(reader, scratch[:headerSize])
	if err != nil {
		return nil, fmt.Errorf("cannot read header: %w", err)
	}

	// nodes' element at index 0 is a special case, meaning nil.
	nodes := make([]*node.Node, nodesCount+1) // +1 for 0 index meaning nil
	tries := make([]*trie.MTrie, triesCount)

	logging := logProgress("reading trie nodes", int(nodesCount), logger)

	for i := uint64(1); i <= nodesCount; i++ {
		n, err := flattener.ReadNode(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(i) {
				return nil, fmt.Errorf("sequence of serialized nodes does not satisfy Descendents-First-Relationship")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
		}
		nodes[i] = n
		logging(i)
	}

	logger.Info().Msgf("finished loading %v trie nodes, start loading %v tries", nodesCount, triesCount)

	for i := uint16(0); i < triesCount; i++ {
		trie, err := flattener.ReadTrie(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
			if nodeIndex >= uint64(len(nodes)) {
				return nil, fmt.Errorf("sequence of stored nodes doesn't contain node")
			}
			return nodes[nodeIndex], nil
		})
		if err != nil {
			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
		}
		tries[i] = trie
	}

	// Read footer again for crc32 computation
	// No action is needed.
	_, err = io.ReadFull(reader, footer)
	if err != nil {
		return nil, fmt.Errorf("cannot read footer: %w", err)
	}

	// Read CRC32
	crc32buf := scratch[:crc32SumSize]
	_, err = io.ReadFull(bufReader, crc32buf)
	if err != nil {
		return nil, fmt.Errorf("cannot read CRC32: %w", err)
	}

	readCrc32 := binary.BigEndian.Uint32(crc32buf)

	calculatedCrc32 := crcReader.Crc32()

	if calculatedCrc32 != readCrc32 {
		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
	}

	return tries, nil
}

// evictFileFromLinuxPageCache advises Linux to evict a file from Linux page cache.
// A use case is when a new checkpoint is loaded or created, Linux may cache big
// checkpoint files in memory until evictFileFromLinuxPageCache causes them to be
// evicted from the Linux page cache.  Not calling evictFileFromLinuxPageCache()
// causes two checkpoint files to be cached for each checkpointing, eventually
// caching hundreds of GB.
// CAUTION: no-op when GOOS != linux.
func evictFileFromLinuxPageCache(f *os.File, fsync bool, logger zerolog.Logger) error {
	err := fadviseNoLinuxPageCache(f.Fd(), fsync)
	if err != nil {
		return err
	}

	size := int64(0)
	fstat, err := f.Stat()
	if err == nil {
		size = fstat.Size()
	}

	logger.Info().Str("filename", f.Name()).Int64("size_mb", size/1024/1024).Msg("evicted file from Linux page cache")
	return nil
}

// CopyCheckpointFile copies the checkpoint file, including its part files, from the given
// `from` directory to the `to` directory.
// It returns the paths of all the copied files.
// Any error returned is an exception.
func CopyCheckpointFile(filename string, from string, to string) (
	[]string,
	error,
) {
	// It's possible that the trie dir does not yet exist. If not, this will create the required path
	err := os.MkdirAll(to, 0700)
	if err != nil {
		return nil, err
	}

	// checkpoint V6 produces multiple checkpoint part files that need to be copied over
	pattern := filePathPattern(from, filename)
	matched, err := filepath.Glob(pattern)
	if err != nil {
		return nil, fmt.Errorf("could not glob checkpoint file with pattern %v: %w", pattern, err)
	}

	newPaths := make([]string, len(matched))
	// copy the checkpoint part files concurrently
	var group errgroup.Group

	for i, match := range matched {
		_, partfile := filepath.Split(match)
		newPath := filepath.Join(to, partfile)
		newPaths[i] = newPath

		match := match
		group.Go(func() error {
			err := utilsio.Copy(match, newPath)
			if err != nil {
				return fmt.Errorf("cannot copy file from %v to %v: %w", match, newPath, err)
			}
			return nil
		})
	}

	err = group.Wait()
	if err != nil {
		return nil, fmt.Errorf("fail to copy checkpoint files: %w", err)
	}

	return newPaths, nil
}
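
// A minimal usage sketch for CopyCheckpointFile (illustrative paths):
//
//	copied, err := CopyCheckpointFile("checkpoint.00000042", "/data/trie", "/backup/trie")
//	if err != nil {
//		return err
//	}
//	// copied holds the destination paths, including any V6 part files
//	// matched by filePathPattern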