github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/ledger/complete/wal/checkpointer.go

     1  package wal
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/binary"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"path"
    11  	"path/filepath"
    12  	"sort"
    13  	"strconv"
    14  	"strings"
    15  
    16  	"github.com/docker/go-units"
    17  	"github.com/rs/zerolog"
    18  	"github.com/rs/zerolog/log"
    19  	"golang.org/x/sync/errgroup"
    20  
    21  	"github.com/onflow/flow-go/ledger"
    22  	"github.com/onflow/flow-go/ledger/complete/mtrie"
    23  	"github.com/onflow/flow-go/ledger/complete/mtrie/flattener"
    24  	"github.com/onflow/flow-go/ledger/complete/mtrie/node"
    25  	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
    26  	"github.com/onflow/flow-go/model/bootstrap"
    27  	"github.com/onflow/flow-go/module/metrics"
    28  	"github.com/onflow/flow-go/module/util"
    29  	utilsio "github.com/onflow/flow-go/utils/io"
    30  )
    31  
    32  const checkpointFilenamePrefix = "checkpoint."
    33  
    34  const (
    35  	MagicBytesCheckpointHeader  uint16 = 0x2137
    36  	MagicBytesCheckpointSubtrie uint16 = 0x2136
    37  	MagicBytesCheckpointToptrie uint16 = 0x2135
    38  	MagicBytesPayloadHeader     uint16 = 0x2138
    39  )
    40  
    41  const VersionV1 uint16 = 0x01
    42  
    44  // The version numbering was reset while changing the trie format, so it is bumped to 3 to avoid conflicts.
    44  // Version 3 contains a file checksum for detecting corrupted checkpoint files.
    45  const VersionV3 uint16 = 0x03
    46  
    47  // Version 4 contains a footer with node count and trie count (previously in the header).
    48  // Version 4 also reduces checkpoint data size.  See EncodeNode() and EncodeTrie() for more details.
    49  const VersionV4 uint16 = 0x04
    50  
    51  // Version 5 includes these changes:
    52  // - remove regCount and maxDepth from serialized nodes
    53  // - add allocated register count and size to serialized tries
    54  // - reduce number of bytes used to encode payload value size from 8 bytes to 4 bytes.
    55  // See EncodeNode() and EncodeTrie() for more details.
    56  const VersionV5 uint16 = 0x05
    57  
    58  // Version 6 includes these changes:
    59  //   - trie nodes are stored in 17 additional checkpoint files, with .0, .1, .2, ... .16 as
    60  //     the file name extensions
    61  const VersionV6 uint16 = 0x06
    62  
    63  // MaxVersion is the latest checkpoint version we support.
    64  // Need to update MaxVersion when creating a newer version.
    65  const MaxVersion = VersionV6
    66  
    67  const (
    68  	encMagicSize        = 2
    69  	encVersionSize      = 2
    70  	headerSize          = encMagicSize + encVersionSize
    71  	encSubtrieCountSize = 2
    72  	encNodeCountSize    = 8
    73  	encTrieCountSize    = 2
    74  	crc32SumSize        = 4
    75  )
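
        // As an illustration of the constants above: a checkpoint header is the
        // 4-byte sequence magic||version, both big-endian. A minimal sketch of
        // encoding it (hypothetical helper, not used elsewhere in this package):
        func encodeCheckpointHeader(version uint16) []byte {
        	buf := make([]byte, headerSize)
        	binary.BigEndian.PutUint16(buf, MagicBytesCheckpointHeader)
        	binary.BigEndian.PutUint16(buf[encMagicSize:], version)
        	return buf // e.g. [0x21, 0x37, 0x00, 0x06] for VersionV6
        }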
    76  
    77  // defaultBufioReadSize replaces the default bufio buffer size of 4096 bytes.
    78  // defaultBufioReadSize is currently 32KiB and can be tuned further if it
    79  // improves performance on typical EN hardware.
    80  const defaultBufioReadSize = 1024 * 32
    81  
    82  // defaultBufioWriteSize replaces the default bufio buffer size of 4096 bytes.
    83  // defaultBufioWriteSize is currently 32KiB and can be tuned further if it
    84  // improves performance on typical EN hardware.
    85  const defaultBufioWriteSize = 1024 * 32
    86  
    87  type Checkpointer struct {
    88  	dir            string
    89  	wal            *DiskWAL
    90  	keyByteSize    int
    91  	forestCapacity int
    92  }
    93  
    94  func NewCheckpointer(wal *DiskWAL, keyByteSize int, forestCapacity int) *Checkpointer {
    95  	return &Checkpointer{
    96  		dir:            wal.wal.Dir(),
    97  		wal:            wal,
    98  		keyByteSize:    keyByteSize,
    99  		forestCapacity: forestCapacity,
   100  	}
   101  }
   102  
   103  // listCheckpoints returns all the numbers (unsorted) of the checkpoint files, and the number of the last checkpoint.
   104  func (c *Checkpointer) listCheckpoints() ([]int, int, error) {
   105  	return ListCheckpoints(c.dir)
   106  }
   107  
   108  // ListCheckpoints returns all the numbers of the checkpoint files, and the number of the last checkpoint.
   109  // Note: it doesn't include the root checkpoint file.
   110  func ListCheckpoints(dir string) ([]int, int, error) {
   111  	list := make([]int, 0)
   112  
   113  	files, err := os.ReadDir(dir)
   114  	if err != nil {
   115  		return nil, -1, fmt.Errorf("cannot list directory [%s] content: %w", dir, err)
   116  	}
   117  	last := -1
   118  	for _, fn := range files {
   119  		fname := fn.Name()
   120  		if !strings.HasPrefix(fname, checkpointFilenamePrefix) {
   121  			continue
   122  		}
   123  		justNumber := fname[len(checkpointFilenamePrefix):]
   124  		k, err := strconv.Atoi(justNumber)
   125  		if err != nil {
   126  			continue
   127  		}
   128  
   129  		list = append(list, k)
   130  
   131  		// the last checkpoint is the one with the highest number
   132  		if k > last {
   133  			last = k
   134  		}
   135  	}
   136  
   137  	return list, last, nil
   138  }
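
        // For example (hypothetical directory contents): if dir contains the files
        // "checkpoint.00000001", "checkpoint.00000005" and "checkpoint.tmp", then
        // ListCheckpoints returns ([]int{1, 5}, 5, nil); entries whose suffix is not
        // a number are skipped.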
   139  
   140  // Checkpoints returns all the numbers of the checkpoint files in ascending order.
   141  // Note: it doesn't include the root checkpoint file.
   142  func (c *Checkpointer) Checkpoints() ([]int, error) {
   143  	return Checkpoints(c.dir)
   144  }
   145  
   146  // Checkpoints returns all the checkpoint numbers in ascending order.
   147  func Checkpoints(dir string) ([]int, error) {
   148  	list, _, err := ListCheckpoints(dir)
   149  	if err != nil {
   150  		return nil, fmt.Errorf("could not fetch all checkpoints: %w", err)
   151  	}
   152  
   153  	sort.Ints(list)
   154  
   155  	return list, nil
   156  }
   157  
   158  // LatestCheckpoint returns the number of the latest checkpoint, or -1 if there are no checkpoints.
   159  func (c *Checkpointer) LatestCheckpoint() (int, error) {
   160  	_, last, err := c.listCheckpoints()
   161  	return last, err
   162  }
   163  
   164  // NotCheckpointedSegments returns the numbers of segments which are not checkpointed yet,
   165  // or -1, -1 if there are no segments.
   166  func (c *Checkpointer) NotCheckpointedSegments() (from, to int, err error) {
   167  
   168  	latestCheckpoint, err := c.LatestCheckpoint()
   169  	if err != nil {
   170  		return -1, -1, fmt.Errorf("cannot get last checkpoint: %w", err)
   171  	}
   172  
   173  	first, last, err := c.wal.Segments()
   174  	if err != nil {
   175  		return -1, -1, fmt.Errorf("cannot get range of segments: %w", err)
   176  	}
   177  
   178  	// there are no segments at all, there is nothing to checkpoint
   179  	if first == -1 && last == -1 {
   180  		return -1, -1, nil
   181  	}
   182  
   183  	// no checkpoints
   184  	if latestCheckpoint == -1 {
   185  		return first, last, nil
   186  	}
   187  
   188  	// segments before checkpoint
   189  	if last <= latestCheckpoint {
   190  		return -1, -1, nil
   191  	}
   192  
   193  	// there is a gap between the last checkpoint and the segments
   194  	if last > latestCheckpoint && latestCheckpoint < first-1 {
   195  		return -1, -1, fmt.Errorf("gap between last checkpoint and segments")
   196  	}
   197  
   198  	return latestCheckpoint + 1, last, nil
   199  }
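
        // For example (hypothetical values): with latest checkpoint 10 and WAL segments
        // 8..15 on disk, NotCheckpointedSegments returns (11, 15, nil); with segments
        // 8..10 only, it returns (-1, -1, nil) because everything is already checkpointed.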
   200  
   201  // Checkpoint creates a new checkpoint stopping at the given segment.
   202  func (c *Checkpointer) Checkpoint(to int) (err error) {
   203  
   204  	_, notCheckpointedTo, err := c.NotCheckpointedSegments()
   205  	if err != nil {
   206  		return fmt.Errorf("cannot get not checkpointed segments: %w", err)
   207  	}
   208  
   209  	latestCheckpoint, err := c.LatestCheckpoint()
   210  	if err != nil {
   211  		return fmt.Errorf("cannot get latest checkpoint: %w", err)
   212  	}
   213  
   214  	if latestCheckpoint == to {
   215  		return nil //nothing to do
   216  	}
   217  
   218  	if notCheckpointedTo < to {
   219  		return fmt.Errorf("no segments to checkpoint to %d, latest not checkpointed segment: %d", to, notCheckpointedTo)
   220  	}
   221  
   222  	forest, err := mtrie.NewForest(c.forestCapacity, &metrics.NoopCollector{}, nil)
   223  	if err != nil {
   224  		return fmt.Errorf("cannot create Forest: %w", err)
   225  	}
   226  
   227  	c.wal.log.Info().Msgf("creating checkpoint %d", to)
   228  
   229  	err = c.wal.replay(0, to,
   230  		func(tries []*trie.MTrie) error {
   231  			return forest.AddTries(tries)
   232  		},
   233  		func(update *ledger.TrieUpdate) error {
   234  			_, err := forest.Update(update)
   235  			return err
   236  		}, func(rootHash ledger.RootHash) error {
   237  			return nil
   238  		}, true)
   239  
   240  	if err != nil {
   241  		return fmt.Errorf("cannot replay WAL: %w", err)
   242  	}
   243  
   244  	tries, err := forest.GetTries()
   245  	if err != nil {
   246  		return fmt.Errorf("cannot get forest tries: %w", err)
   247  	}
   248  
   249  	c.wal.log.Info().Msgf("serializing checkpoint %d", to)
   250  
   251  	fileName := NumberToFilename(to)
   252  
   253  	err = StoreCheckpointV6SingleThread(tries, c.wal.dir, fileName, c.wal.log)
   254  
   255  	if err != nil {
   256  		return fmt.Errorf("could not create checkpoint for %v: %w", to, err)
   257  	}
   258  
   259  	checkpointFileSize, err := ReadCheckpointFileSize(c.wal.dir, fileName)
   260  	if err != nil {
   261  		return fmt.Errorf("could not read checkpoint file size: %w", err)
   262  	}
   263  
   264  	c.wal.log.Info().
   265  		Str("checkpoint_file_size", units.BytesSize(float64(checkpointFileSize))).
   266  		Msgf("created checkpoint %d with %d tries", to, len(tries))
   267  
   268  	return nil
   269  }
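
        // A minimal usage sketch (assuming an initialized *Checkpointer named c):
        //
        //	_, to, err := c.NotCheckpointedSegments()
        //	if err == nil && to >= 0 {
        //		err = c.Checkpoint(to)
        //	}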
   270  
   271  func NumberToFilenamePart(n int) string {
   272  	return fmt.Sprintf("%08d", n)
   273  }
   274  
   275  func NumberToFilename(n int) string {
   276  
   277  	return fmt.Sprintf("%s%s", checkpointFilenamePrefix, NumberToFilenamePart(n))
   278  }
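
        // For example, NumberToFilename(7) returns "checkpoint.00000007".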
   279  
   280  func (c *Checkpointer) CheckpointWriter(to int) (io.WriteCloser, error) {
   281  	return CreateCheckpointWriterForFile(c.dir, NumberToFilename(to), c.wal.log)
   282  }
   283  
   284  func (c *Checkpointer) Dir() string {
   285  	return c.dir
   286  }
   287  
   288  // CreateCheckpointWriterForFile returns a file writer that will write to a temporary file and then move it to the checkpoint folder by renaming it.
   289  func CreateCheckpointWriterForFile(dir, filename string, logger zerolog.Logger) (io.WriteCloser, error) {
   290  
   291  	fullname := path.Join(dir, filename)
   292  
   293  	if utilsio.FileExists(fullname) {
   294  		return nil, fmt.Errorf("checkpoint file %s already exists", fullname)
   295  	}
   296  
   297  	tmpFile, err := os.CreateTemp(dir, "writing-chkpnt-*")
   298  	if err != nil {
   299  		return nil, fmt.Errorf("cannot create temporary file for checkpoint in %v: %w", dir, err)
   300  	}
   301  
   302  	writer := bufio.NewWriterSize(tmpFile, defaultBufioWriteSize)
   303  	return &SyncOnCloseRenameFile{
   304  		logger:     logger,
   305  		file:       tmpFile,
   306  		targetName: fullname,
   307  		Writer:     writer,
   308  	}, nil
   309  }
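
        // A minimal usage sketch (hypothetical variables): the returned writer buffers
        // into a temporary file and only moves it to its final name on Close, so a
        // partially written checkpoint never appears under dir/filename:
        //
        //	w, err := CreateCheckpointWriterForFile(dir, NumberToFilename(42), logger)
        //	if err != nil {
        //		return err
        //	}
        //	_, err = w.Write(data)
        //	err = w.Close() // flush, sync, and rename happen here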
   310  
   311  // StoreCheckpointV5 writes the given tries to a checkpoint file, and also appends
   312  // a CRC32 file checksum for integrity checks.
   313  // A checkpoint file consists of a flattened forest. Specifically, it consists of:
   314  //   - a list of encoded nodes, where references to other nodes are by list index.
   315  //   - a list of encoded tries, each referencing their respective root node by index.
   316  //
   317  // Referencing other nodes by index 0 is a special case, meaning nil.
   318  //
   319  // As an important property, the nodes are listed in an order which satisfies the
   320  // Descendants-First-Relationship. The Descendants-First-Relationship has the
   321  // following important property:
   322  // when rebuilding the trie from the sequence of nodes, the trie can be built on
   323  // the fly, because for each node its children have already been encountered.
   324  // TODO: evaluate alternatives to CRC32 since checkpoint file is many GB in size.
   325  // TODO: add concurrency if the performance gains are enough to offset complexity.
   326  func StoreCheckpointV5(dir string, fileName string, logger zerolog.Logger, tries ...*trie.MTrie) (
   327  	// Note: the returned error must be named (errToReturn), because the
   328  	// deferred function below updates it before returning; an unnamed error
   329  	// return value could not be updated by that defer.
   330  	errToReturn error,
   331  ) {
   332  	writer, err := CreateCheckpointWriterForFile(dir, fileName, logger)
   333  	if err != nil {
   334  		return fmt.Errorf("could not create writer: %w", err)
   335  	}
   336  	defer func() {
   337  		errToReturn = closeAndMergeError(writer, errToReturn)
   338  	}()
   339  
   340  	crc32Writer := NewCRC32Writer(writer)
   341  
   342  	// Scratch buffer is a temporary buffer that nodes are encoded into.
   343  	// Data in scratch buffer should be copied or used before scratch buffer is used again.
   344  	// If the scratch buffer isn't large enough, a new buffer will be allocated.
   345  	// However, 4096 bytes will be large enough to handle almost all payloads
   346  	// and 100% of interim nodes.
   347  	scratch := make([]byte, 1024*4)
   348  
   349  	// Write header: magic (2 bytes) + version (2 bytes)
   350  	header := scratch[:headerSize]
   351  	binary.BigEndian.PutUint16(header, MagicBytesCheckpointHeader)
   352  	binary.BigEndian.PutUint16(header[encMagicSize:], VersionV5)
   353  
   354  	_, err = crc32Writer.Write(header)
   355  	if err != nil {
   356  		return fmt.Errorf("cannot write checkpoint header: %w", err)
   357  	}
   358  
   359  	// Multiple tries might share nodes at higher levels. However, we don't want to
   360  	// serialize duplicated nodes in the checkpoint file. In order to deduplicate, we build
   361  	// a map of unique nodes while iterating and serializing the nodes to the checkpoint file.
   362  	//
   363  	// A map for deduplicating all the trie nodes would use a lot of memory.
   364  	// In fact, we don't have to build a map for all nodes, since there are nodes which
   365  	// are never shared.  Nodes can be shared if and only if they are
   366  	// on the same path. In other words, nodes on different paths won't be shared.
   367  	// If we group trie nodes by path, then we get many smaller groups of trie nodes on the same path,
   368  	// which might contain duplicates. Then, for each group, we can build a smaller map for deduplication.
   369  	// Processing each group sequentially allows us to reduce operational memory.
   370  	//
   371  	// With this idea in mind, the serialization can be done in two steps:
   372  	// 1. serialize nodes in subtries (tries with roots at subtrieLevel).
   373  	// 2. serialize remaining nodes (from trie root to subtrie root).
   374  	// For instance, if there are 3 top tries, and subtrieLevel is 4, then there will be
   375  	// 	(2 ^ 4) * 3 = 48 subtrie root nodes at level 4.
   376  	// Then step 1 will serialize the 48 subtries into the checkpoint file, and
   377  	// step 2 will serialize the 3 root nodes (level 0) and the interim nodes from level 1 to 3
   378  	// into the checkpoint file as well.
   379  	// Step 1:
   380  	// 1. Find all the subtrie root nodes at subtrieLevel (level 4).
   381  	// 2. Group the subtries by path. Since subtries in different groups have different paths, they won't
   382  	//		share child nodes. Subtries in the same group might have duplicates, so we build a map to deduplicate.
   383  	//
   384  	// subtrieLevel is the number of edges from the trie root to a subtrie root.
   385  	// Trie root is at level 0.
   386  	const subtrieLevel = 4
   387  
   388  	// subtrieCount is the number of subtries at subtrieLevel.
   389  	const subtrieCount = 1 << subtrieLevel
   390  
   391  	// Since each trie has `subtrieCount` subtries at subtrieLevel, we create
   392  	// `subtrieCount` groups; each group contains one subtrie root node per trie.
   393  
   394  	// subtrieRoots is an array of groups.
   395  	// Each group contains the subtrie roots of the same path at subtrieLevel for different tries.
   396  	// For example, if subtrieLevel is 4, then
   397  	// - subtrieRoots[0] is a list of all subtrie roots at path [0,0,0,0]
   398  	// - subtrieRoots[1] is a list of all subtrie roots at path [0,0,0,1]
   399  	// - subtrieRoots[subtrieCount-1] is a list of all subtrie roots at path [1,1,1,1]
   400  	// subtrie roots in subtrieRoots[0] have the same path, therefore might have shared child nodes.
   401  	var subtrieRoots [subtrieCount][]*node.Node
   402  	for i := 0; i < len(subtrieRoots); i++ {
   403  		subtrieRoots[i] = make([]*node.Node, len(tries))
   404  	}
   405  
   406  	for trieIndex, t := range tries {
   407  		// subtries is an array with subtrieCount trie nodes
   408  		// in breadth-first order at subtrieLevel of the trie `t`
   409  		subtries := getNodesAtLevel(t.RootNode(), subtrieLevel)
   410  		for subtrieIndex, subtrieRoot := range subtries {
   411  			subtrieRoots[subtrieIndex][trieIndex] = subtrieRoot
   412  		}
   413  	}
   414  
   415  	// topLevelNodes contains all unique nodes of the given tries
   416  	// from the root down to the subtrie roots, and their index
   417  	// (ordered by node traversal sequence).
   418  	// Index 0 is a special case with nil node.
   419  	topLevelNodes := make(map[*node.Node]uint64, 1<<(subtrieLevel+1))
   420  	topLevelNodes[nil] = 0
   421  
   422  	// nodeCounter is a counter for all unique nodes.
   423  	// It starts from 1, as 0 marks the nil node.
   424  	nodeCounter := uint64(1)
   425  
   426  	// estimatedSubtrieNodeCount is a rough estimate of the number of nodes in a subtrie,
   427  	// assuming the trie is a full binary tree.  estimatedSubtrieNodeCount is used
   428  	// to preallocate traversedSubtrieNodes for memory efficiency.
   429  	estimatedSubtrieNodeCount := 0
   430  	if len(tries) > 0 {
   431  		estimatedTrieNodeCount := 2*int(tries[0].AllocatedRegCount()) - 1
   432  		estimatedSubtrieNodeCount = estimatedTrieNodeCount / subtrieCount
   433  	}
   434  
   435  	// Serialize subtrie nodes
   436  	for i, subTrieRoot := range subtrieRoots {
   437  		// traversedSubtrieNodes contains all unique nodes of subtries of the same path and their index.
   438  		traversedSubtrieNodes := make(map[*node.Node]uint64, estimatedSubtrieNodeCount)
   439  		// Index 0 is a special case with nil node.
   440  		traversedSubtrieNodes[nil] = 0
   441  
   442  		logging := logProgress(fmt.Sprintf("storing %v-th subtrie roots", i), estimatedSubtrieNodeCount, log.Logger)
   443  		for _, root := range subTrieRoot {
   444  			// Empty trie is always added to forest as starting point and
   445  			// empty trie's root is nil. It remains in the forest until evicted
   446  			// by trie queue exceeding capacity.
   447  			if root == nil {
   448  				continue
   449  			}
   450  			// Note: nodeCounter assigns a global index to each node in the order in which it is
   451  			// serialized into the checkpoint file. Therefore, it has to be carried over when iterating each subtrie.
   452  			// storeUniqueNodes adds each unique visited node to traversedSubtrieNodes, with the node
   453  			// itself as key and its position in the serialization order (the n-th node) as value.
   454  			nodeCounter, err = storeUniqueNodes(root, traversedSubtrieNodes, nodeCounter, scratch, crc32Writer, logging)
   455  			if err != nil {
   456  				return fmt.Errorf("fail to store nodes in step 1 for subtrie root %v: %w", root.Hash(), err)
   457  			}
   458  			// Save subtrie root node index in topLevelNodes,
   459  			// so when traversing top level tries
   460  			// (from level 0 to subtrieLevel) using topLevelNodes,
   461  			// node iterator skips subtrie as visited nodes.
   462  			// the node iterator skips subtries as already visited nodes.
   463  		}
   464  	}
   465  
   466  	// Step 2:
   467  	// Now all nodes at subtrieLevel and below have been serialized. We now
   468  	// serialize the remaining nodes of each trie, from the root node (level 0) to (subtrieLevel - 1).
   469  	for _, t := range tries {
   470  		root := t.RootNode()
   471  		if root == nil {
   472  			continue
   473  		}
   474  		// If we iterated through the root trie with an empty visited-nodes map, it would visit
   475  		// all nodes at all levels. In order to skip the nodes at subtrieLevel and below, which were serialized in step 1,
   476  		// we pass in a visited-nodes map that already contains all the subtrie root nodes: topLevelNodes.
   477  		// topLevelNodes was built in step 1, while serializing the subtrie root nodes.
   478  		nodeCounter, err = storeUniqueNodes(root, topLevelNodes, nodeCounter, scratch, crc32Writer, func(uint64) {})
   479  		if err != nil {
   480  			return fmt.Errorf("fail to store nodes in step 2 for root trie %v: %w", root.Hash(), err)
   481  		}
   482  	}
   483  
   484  	// The tries themselves are serialized at the end of the checkpoint file, so that it's easy to
   485  	// find which tries are included.
   486  	for _, t := range tries {
   487  		rootNode := t.RootNode()
   488  		if !t.IsEmpty() && rootNode.Height() != ledger.NodeMaxHeight {
   489  			return fmt.Errorf("height of root node must be %d, but is %d",
   490  				ledger.NodeMaxHeight, rootNode.Height())
   491  		}
   492  
   493  		// Get root node index
   494  		rootIndex, found := topLevelNodes[rootNode]
   495  		if !found {
   496  			rootHash := t.RootHash()
   497  			return fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(rootHash[:]))
   498  		}
   499  
   500  		encTrie := flattener.EncodeTrie(t, rootIndex, scratch)
   501  		_, err = crc32Writer.Write(encTrie)
   502  		if err != nil {
   503  			return fmt.Errorf("cannot serialize trie: %w", err)
   504  		}
   505  	}
   506  
   507  	// All trie nodes have been serialized into the checkpoint file, now
   508  	// write the footer with node count and trie count.
   509  	footer := scratch[:encNodeCountSize+encTrieCountSize]
   510  	binary.BigEndian.PutUint64(footer, nodeCounter-1) // -1 to account for 0 node meaning nil
   511  	binary.BigEndian.PutUint16(footer[encNodeCountSize:], uint16(len(tries)))
   512  
   513  	_, err = crc32Writer.Write(footer)
   514  	if err != nil {
   515  		return fmt.Errorf("cannot write checkpoint footer: %w", err)
   516  	}
   517  
   518  	// Write the CRC32 sum accumulated over everything written so far, for validation
   519  	crc32buf := scratch[:crc32SumSize]
   520  	binary.BigEndian.PutUint32(crc32buf, crc32Writer.Crc32())
   521  
   522  	_, err = writer.Write(crc32buf)
   523  	if err != nil {
   524  		return fmt.Errorf("cannot write CRC32: %w", err)
   525  	}
   526  
   527  	return nil
   528  }
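
        // A minimal round-trip sketch (assuming tries to persist and a directory dir):
        //
        //	if err := StoreCheckpointV5(dir, NumberToFilename(1), logger, tries...); err != nil {
        //		return err
        //	}
        //	loaded, err := LoadCheckpoint(path.Join(dir, NumberToFilename(1)), logger)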
   529  
   530  func logProgress(msg string, estimatedSubtrieNodeCount int, logger zerolog.Logger) func(nodeCounter uint64) {
   531  	lg := util.LogProgress(
   532  		logger,
   533  		util.DefaultLogProgressConfig(
   534  			msg,
   535  			estimatedSubtrieNodeCount,
   536  		),
   537  	)
   538  	return func(index uint64) {
   539  		lg(1)
   540  	}
   541  }
   542  
   543  // storeUniqueNodes iterates over and serializes the unique nodes of the trie with the given root node.
   544  // It also saves each unique node and its assigned index in the visitedNodes map.
   545  // It returns nodeCounter and error (if any).
   546  func storeUniqueNodes(
   547  	root *node.Node,
   548  	visitedNodes map[*node.Node]uint64,
   549  	nodeCounter uint64,
   550  	scratch []byte,
   551  	writer io.Writer,
   552  	nodeCounterUpdated func(nodeCounter uint64), // for logging estimated progress
   553  ) (uint64, error) {
   554  
   555  	for itr := flattener.NewUniqueNodeIterator(root, visitedNodes); itr.Next(); {
   556  		n := itr.Value()
   557  
   558  		visitedNodes[n] = nodeCounter
   559  		nodeCounter++
   560  		nodeCounterUpdated(nodeCounter)
   561  
   562  		var lchildIndex, rchildIndex uint64
   563  
   564  		if lchild := n.LeftChild(); lchild != nil {
   565  			var found bool
   566  			lchildIndex, found = visitedNodes[lchild]
   567  			if !found {
   568  				hash := lchild.Hash()
   569  				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
   570  			}
   571  		}
   572  		if rchild := n.RightChild(); rchild != nil {
   573  			var found bool
   574  			rchildIndex, found = visitedNodes[rchild]
   575  			if !found {
   576  				hash := rchild.Hash()
   577  				return 0, fmt.Errorf("internal error: missing node with hash %s", hex.EncodeToString(hash[:]))
   578  			}
   579  		}
   580  
   581  		encNode := flattener.EncodeNode(n, lchildIndex, rchildIndex, scratch)
   582  		_, err := writer.Write(encNode)
   583  		if err != nil {
   584  			return 0, fmt.Errorf("cannot serialize node: %w", err)
   585  		}
   586  	}
   587  
   588  	return nodeCounter, nil
   589  }
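
        // Because the iterator yields children before their parents, the index recorded
        // for a node is always larger than the indices of its children; this is the
        // Descendants-First-Relationship that the checkpoint readers below rely on to
        // rebuild tries in a single pass.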
   590  
   591  // getNodesAtLevel returns 2^level nodes at the given level in breadth-first order.
   592  // It guarantees the size and order of the returned nodes (a nil element if there is no node at a position).
   593  // For example, given nil root and level 3, getNodesAtLevel returns a slice
   594  // of 2^3 nil elements.
   595  func getNodesAtLevel(root *node.Node, level uint) []*node.Node {
   596  	nodes := []*node.Node{root}
   597  	nodesLevel := uint(0)
   598  
   599  	// Use breadth first traversal to get all nodes at given level.
   600  	// If a node isn't found, a nil node is used in its place.
   601  	for nodesLevel < level {
   602  		nextLevel := nodesLevel + 1
   603  		nodesAtNextLevel := make([]*node.Node, 1<<nextLevel)
   604  
   605  		for i, n := range nodes {
   606  			if n != nil {
   607  				nodesAtNextLevel[i*2] = n.LeftChild()
   608  				nodesAtNextLevel[i*2+1] = n.RightChild()
   609  			}
   610  		}
   611  
   612  		nodes = nodesAtNextLevel
   613  		nodesLevel = nextLevel
   614  	}
   615  
   616  	return nodes
   617  }
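
        // For example, with level 4 the 16 returned positions map to paths by the bits
        // of the index (most significant bit first, 0 = left, 1 = right): index 5 =
        // 0b0101 is the node reached by going left, right, left, right from the root,
        // or nil if no node exists at that position.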
   618  
   619  func (c *Checkpointer) LoadCheckpoint(checkpoint int) ([]*trie.MTrie, error) {
   620  	filepath := path.Join(c.dir, NumberToFilename(checkpoint))
   621  	return LoadCheckpoint(filepath, c.wal.log)
   622  }
   623  
   624  func (c *Checkpointer) LoadRootCheckpoint() ([]*trie.MTrie, error) {
   625  	filepath := path.Join(c.dir, bootstrap.FilenameWALRootCheckpoint)
   626  	return LoadCheckpoint(filepath, c.wal.log)
   627  }
   628  
   629  func (c *Checkpointer) HasRootCheckpoint() (bool, error) {
   630  	return HasRootCheckpoint(c.dir)
   631  }
   632  
   633  func HasRootCheckpoint(dir string) (bool, error) {
   634  	if _, err := os.Stat(path.Join(dir, bootstrap.FilenameWALRootCheckpoint)); err == nil {
   635  		return true, nil
   636  	} else if os.IsNotExist(err) {
   637  		return false, nil
   638  	} else {
   639  		return false, err
   640  	}
   641  }
   642  
   643  func (c *Checkpointer) RemoveCheckpoint(checkpoint int) error {
   644  	name := NumberToFilename(checkpoint)
   645  	return deleteCheckpointFiles(c.dir, name)
   646  }
   647  
   648  func LoadCheckpoint(filepath string, logger zerolog.Logger) (
   649  	tries []*trie.MTrie,
   650  	errToReturn error) {
   651  	file, err := os.Open(filepath)
   652  	if err != nil {
   653  		return nil, fmt.Errorf("cannot open checkpoint file %s: %w", filepath, err)
   654  	}
   655  	defer func() {
   656  		evictErr := evictFileFromLinuxPageCache(file, false, logger)
   657  		if evictErr != nil {
   658  			logger.Warn().Msgf("failed to evict file %s from Linux page cache: %s", filepath, evictErr)
   659  			// No need to return this error because it's possible to continue normal operations.
   660  		}
   661  
   662  		errToReturn = closeAndMergeError(file, errToReturn)
   663  	}()
   664  
   665  	return readCheckpoint(file, logger)
   666  }
   667  
   668  func readCheckpoint(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {
   669  
   670  	// Read header: magic (2 bytes) + version (2 bytes)
   671  	header := make([]byte, headerSize)
   672  	_, err := io.ReadFull(f, header)
   673  	if err != nil {
   674  		return nil, fmt.Errorf("cannot read header: %w", err)
   675  	}
   676  
   677  	// Decode header
   678  	magicBytes := binary.BigEndian.Uint16(header)
   679  	version := binary.BigEndian.Uint16(header[encMagicSize:])
   680  
   681  	// Reset offset
   682  	_, err = f.Seek(0, io.SeekStart)
   683  	if err != nil {
   684  		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
   685  	}
   686  
   687  	if magicBytes != MagicBytesCheckpointHeader {
   688  		return nil, fmt.Errorf("unknown file format. Magic constant %x does not match expected %x", magicBytes, MagicBytesCheckpointHeader)
   689  	}
   690  
   691  	switch version {
   692  	case VersionV1, VersionV3:
   693  		return readCheckpointV3AndEarlier(f, version)
   694  	case VersionV4:
   695  		return readCheckpointV4(f)
   696  	case VersionV5:
   697  		return readCheckpointV5(f, logger)
   698  	case VersionV6:
   699  		return readCheckpointV6(f, logger)
   700  	default:
   701  		return nil, fmt.Errorf("unsupported file version %x", version)
   702  	}
   703  }
   704  
   705  type nodeWithRegMetrics struct {
   706  	n        *node.Node
   707  	regCount uint64
   708  	regSize  uint64
   709  }
   710  
   711  // readCheckpointV3AndEarlier deserializes a checkpoint file (version 3 or earlier) and returns a list of tries.
   712  // Header (magic and version) is verified by the caller.
   713  // This function is for backwards compatibility and is not optimized.
   714  func readCheckpointV3AndEarlier(f *os.File, version uint16) ([]*trie.MTrie, error) {
   715  
   716  	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
   717  	crcReader := NewCRC32Reader(bufReader)
   718  
   719  	var reader io.Reader
   720  
   721  	if version != VersionV3 {
   722  		reader = bufReader
   723  	} else {
   724  		reader = crcReader
   725  	}
   726  
   727  	// Read header (magic + version), node count, and trie count.
   728  	header := make([]byte, headerSize+encNodeCountSize+encTrieCountSize)
   729  
   730  	_, err := io.ReadFull(reader, header)
   731  	if err != nil {
   732  		return nil, fmt.Errorf("cannot read header: %w", err)
   733  	}
   734  
   735  	// Magic and version are verified by the caller.
   736  
   737  	// Decode node count and trie count
   738  	nodesCount := binary.BigEndian.Uint64(header[headerSize:])
   739  	triesCount := binary.BigEndian.Uint16(header[headerSize+encNodeCountSize:])
   740  
   741  	nodes := make([]nodeWithRegMetrics, nodesCount+1) //+1 for 0 index meaning nil
   742  	tries := make([]*trie.MTrie, triesCount)
   743  
   744  	for i := uint64(1); i <= nodesCount; i++ {
   745  		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
   746  			if nodeIndex >= uint64(i) {
   747  				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendants-First-Relationship")
   748  			}
   749  			nm := nodes[nodeIndex]
   750  			return nm.n, nm.regCount, nm.regSize, nil
   751  		})
   752  		if err != nil {
   753  			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
   754  		}
   755  		nodes[i].n = n
   756  		nodes[i].regCount = regCount
   757  		nodes[i].regSize = regSize
   758  	}
   759  
   760  	for i := uint16(0); i < triesCount; i++ {
   761  		trie, err := flattener.ReadTrieFromCheckpointV3AndEarlier(reader, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
   762  			if nodeIndex >= uint64(len(nodes)) {
   763  				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
   764  			}
   765  			nm := nodes[nodeIndex]
   766  			return nm.n, nm.regCount, nm.regSize, nil
   767  		})
   768  		if err != nil {
   769  			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
   770  		}
   771  		tries[i] = trie
   772  	}
   773  
   774  	if version == VersionV3 {
   775  		crc32buf := make([]byte, crc32SumSize)
   776  
   777  		_, err := io.ReadFull(bufReader, crc32buf)
   778  		if err != nil {
   779  			return nil, fmt.Errorf("cannot read CRC32: %w", err)
   780  		}
   781  
   782  		readCrc32 := binary.BigEndian.Uint32(crc32buf)
   783  
   784  		calculatedCrc32 := crcReader.Crc32()
   785  
   786  		if calculatedCrc32 != readCrc32 {
   787  			return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
   788  		}
   789  	}
   790  
   791  	return tries, nil
   792  }
   793  
   794  // readCheckpointV4 decodes a checkpoint file (version 4) and returns a list of tries.
   795  // Header (magic and version) is verified by the caller.
   796  // This function is for backwards compatibility.
   797  func readCheckpointV4(f *os.File) ([]*trie.MTrie, error) {
   798  
   799  	// Scratch buffer is a temporary buffer that the reader can read into.
   800  	// Raw data in scratch buffer should be copied or converted into desired
   801  	// objects before next Read operation.  If the scratch buffer isn't large
   802  	// enough, a new buffer will be allocated.  However, 4096 bytes will
   803  	// be large enough to handle almost all payloads and 100% of interim nodes.
   804  	scratch := make([]byte, 1024*4) // must not be less than 1024
   805  
   806  	// Read footer to get node count and trie count
   807  
   808  	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
   809  	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
   810  	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum
   811  
   812  	// Seek to footer
   813  	_, err := f.Seek(-footerOffset, io.SeekEnd)
   814  	if err != nil {
   815  		return nil, fmt.Errorf("cannot seek to footer: %w", err)
   816  	}
   817  
   818  	footer := scratch[:footerSize]
   819  
   820  	_, err = io.ReadFull(f, footer)
   821  	if err != nil {
   822  		return nil, fmt.Errorf("cannot read footer: %w", err)
   823  	}
   824  
   825  	// Decode node count and trie count
   826  	nodesCount := binary.BigEndian.Uint64(footer)
   827  	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])
   828  
   829  	// Seek to the start of file
   830  	_, err = f.Seek(0, io.SeekStart)
   831  	if err != nil {
   832  		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
   833  	}
   834  
   835  	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
   836  	crcReader := NewCRC32Reader(bufReader)
   837  	var reader io.Reader = crcReader
   838  
   839  	// Read header: magic (2 bytes) + version (2 bytes)
   840  	// No action is needed for header because it is verified by the caller.
   841  
   842  	_, err = io.ReadFull(reader, scratch[:headerSize])
   843  	if err != nil {
   844  		return nil, fmt.Errorf("cannot read header: %w", err)
   845  	}
   846  
   847  	// nodes' element at index 0 is special, meaning nil.
   848  	nodes := make([]nodeWithRegMetrics, nodesCount+1) //+1 for 0 index meaning nil
   849  	tries := make([]*trie.MTrie, triesCount)
   850  
   851  	for i := uint64(1); i <= nodesCount; i++ {
   852  		n, regCount, regSize, err := flattener.ReadNodeFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
   853  			if nodeIndex >= uint64(i) {
   854  				return nil, 0, 0, fmt.Errorf("sequence of stored nodes does not satisfy Descendants-First-Relationship")
   855  			}
   856  			nm := nodes[nodeIndex]
   857  			return nm.n, nm.regCount, nm.regSize, nil
   858  		})
   859  		if err != nil {
   860  			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
   861  		}
   862  		nodes[i].n = n
   863  		nodes[i].regCount = regCount
   864  		nodes[i].regSize = regSize
   865  	}
   866  
   867  	for i := uint16(0); i < triesCount; i++ {
   868  		trie, err := flattener.ReadTrieFromCheckpointV4(reader, scratch, func(nodeIndex uint64) (*node.Node, uint64, uint64, error) {
   869  			if nodeIndex >= uint64(len(nodes)) {
   870  				return nil, 0, 0, fmt.Errorf("sequence of stored nodes doesn't contain node")
   871  			}
   872  			nm := nodes[nodeIndex]
   873  			return nm.n, nm.regCount, nm.regSize, nil
   874  		})
   875  		if err != nil {
   876  			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
   877  		}
   878  		tries[i] = trie
   879  	}
   880  
   881  	// Read footer again for crc32 computation
   882  	// No action is needed.
   883  	_, err = io.ReadFull(reader, footer)
   884  	if err != nil {
   885  		return nil, fmt.Errorf("cannot read footer: %w", err)
   886  	}
   887  
   888  	// Read CRC32
   889  	crc32buf := scratch[:crc32SumSize]
   890  	_, err = io.ReadFull(bufReader, crc32buf)
   891  	if err != nil {
   892  		return nil, fmt.Errorf("cannot read CRC32: %w", err)
   893  	}
   894  
   895  	readCrc32 := binary.BigEndian.Uint32(crc32buf)
   896  
   897  	calculatedCrc32 := crcReader.Crc32()
   898  
   899  	if calculatedCrc32 != readCrc32 {
   900  		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
   901  	}
   902  
   903  	return tries, nil
   904  }
   905  
   906  // readCheckpointV5 decodes a checkpoint file (version 5) and returns a list of tries.
   907  // The checkpoint file header (magic and version) is verified by the caller.
   908  func readCheckpointV5(f *os.File, logger zerolog.Logger) ([]*trie.MTrie, error) {
   909  	logger.Info().Msgf("reading v5 checkpoint file")
   910  
   911  	// Scratch buffer is a temporary buffer that the reader can read into.
   912  	// Raw data in scratch buffer should be copied or converted into desired
   913  	// objects before next Read operation.  If the scratch buffer isn't large
   914  	// enough, a new buffer will be allocated.  However, 4096 bytes will
   915  	// be large enough to handle almost all payloads and 100% of interim nodes.
   916  	scratch := make([]byte, 1024*4) // must not be less than 1024
   917  
   918  	// Read footer to get node count and trie count
   919  
   920  	// footer offset: nodes count (8 bytes) + tries count (2 bytes) + CRC32 sum (4 bytes)
   921  	const footerOffset = encNodeCountSize + encTrieCountSize + crc32SumSize
   922  	const footerSize = encNodeCountSize + encTrieCountSize // footer doesn't include crc32 sum
   923  
   924  	// Seek to footer
   925  	_, err := f.Seek(-footerOffset, io.SeekEnd)
   926  	if err != nil {
   927  		return nil, fmt.Errorf("cannot seek to footer: %w", err)
   928  	}
   929  
   930  	footer := scratch[:footerSize]
   931  
   932  	_, err = io.ReadFull(f, footer)
   933  	if err != nil {
   934  		return nil, fmt.Errorf("cannot read footer: %w", err)
   935  	}
   936  
   937  	// Decode node count and trie count
   938  	nodesCount := binary.BigEndian.Uint64(footer)
   939  	triesCount := binary.BigEndian.Uint16(footer[encNodeCountSize:])
   940  
   941  	// Seek to the start of file
   942  	_, err = f.Seek(0, io.SeekStart)
   943  	if err != nil {
   944  		return nil, fmt.Errorf("cannot seek to start of file: %w", err)
   945  	}
   946  
   947  	var bufReader io.Reader = bufio.NewReaderSize(f, defaultBufioReadSize)
   948  	crcReader := NewCRC32Reader(bufReader)
   949  	var reader io.Reader = crcReader
   950  
   951  	// Read header: magic (2 bytes) + version (2 bytes)
   952  	// No action is needed for header because it is verified by the caller.
   953  
   954  	_, err = io.ReadFull(reader, scratch[:headerSize])
   955  	if err != nil {
   956  		return nil, fmt.Errorf("cannot read header: %w", err)
   957  	}
   958  
   959  	// nodes' element at index 0 is special, meaning nil.
   960  	nodes := make([]*node.Node, nodesCount+1) //+1 for 0 index meaning nil
   961  	tries := make([]*trie.MTrie, triesCount)
   962  
   963  	logging := logProgress("reading trie nodes", int(nodesCount), logger)
   964  
   965  	for i := uint64(1); i <= nodesCount; i++ {
   966  		n, err := flattener.ReadNode(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
   967  			if nodeIndex >= uint64(i) {
   968  				return nil, fmt.Errorf("sequence of serialized nodes does not satisfy Descendants-First-Relationship")
   969  			}
   970  			return nodes[nodeIndex], nil
   971  		})
   972  		if err != nil {
   973  			return nil, fmt.Errorf("cannot read node %d: %w", i, err)
   974  		}
   975  		nodes[i] = n
   976  		logging(i)
   977  	}
   978  
   979  	logger.Info().Msgf("finished loading %v trie nodes, start loading %v tries", nodesCount, triesCount)
   980  
   981  	for i := uint16(0); i < triesCount; i++ {
   982  		trie, err := flattener.ReadTrie(reader, scratch, func(nodeIndex uint64) (*node.Node, error) {
   983  			if nodeIndex >= uint64(len(nodes)) {
   984  				return nil, fmt.Errorf("sequence of stored nodes doesn't contain node")
   985  			}
   986  			return nodes[nodeIndex], nil
   987  		})
   988  		if err != nil {
   989  			return nil, fmt.Errorf("cannot read trie %d: %w", i, err)
   990  		}
   991  		tries[i] = trie
   992  	}
   993  
   994  	// Read footer again for crc32 computation
   995  	// No action is needed.
   996  	_, err = io.ReadFull(reader, footer)
   997  	if err != nil {
   998  		return nil, fmt.Errorf("cannot read footer: %w", err)
   999  	}
  1000  
  1001  	// Read CRC32
  1002  	crc32buf := scratch[:crc32SumSize]
  1003  	_, err = io.ReadFull(bufReader, crc32buf)
  1004  	if err != nil {
  1005  		return nil, fmt.Errorf("cannot read CRC32: %w", err)
  1006  	}
  1007  
  1008  	readCrc32 := binary.BigEndian.Uint32(crc32buf)
  1009  
  1010  	calculatedCrc32 := crcReader.Crc32()
  1011  
  1012  	if calculatedCrc32 != readCrc32 {
  1013  		return nil, fmt.Errorf("checkpoint checksum failed! File contains %x but calculated crc32 is %x", readCrc32, calculatedCrc32)
  1014  	}
  1015  
  1016  	return tries, nil
  1017  }
  1018  
  1019  // evictFileFromLinuxPageCache advises Linux to evict a file from the Linux page cache.
  1020  // A use case: when a new checkpoint is loaded or created, Linux may cache the big
  1021  // checkpoint files in memory until evictFileFromLinuxPageCache causes them to be
  1022  // evicted from the Linux page cache.  Not calling evictFileFromLinuxPageCache()
  1023  // causes two checkpoint files to be cached for each checkpointing, eventually
  1024  // caching hundreds of GB.
  1025  // CAUTION: no-op when GOOS != linux.
  1026  func evictFileFromLinuxPageCache(f *os.File, fsync bool, logger zerolog.Logger) error {
  1027  	err := fadviseNoLinuxPageCache(f.Fd(), fsync)
  1028  	if err != nil {
  1029  		return err
  1030  	}
  1031  
  1032  	size := int64(0)
  1033  	fstat, err := f.Stat()
  1034  	if err == nil {
  1035  		size = fstat.Size()
  1036  	}
  1037  
  1038  	logger.Info().Str("filename", f.Name()).Int64("size_mb", size/1024/1024).Msg("evicted file from Linux page cache")
  1039  	return nil
  1040  }
  1041  
  1042  // CopyCheckpointFile copies the checkpoint file, including its part files, from
  1043  // the given `from` directory to the `to` directory.
  1044  // It returns the paths of all the copied files.
  1045  // Any returned errors are exceptions.
  1046  func CopyCheckpointFile(filename string, from string, to string) (
  1047  	[]string,
  1048  	error,
  1049  ) {
  1050  	// It's possible that the trie dir does not yet exist; if so, this creates the required path
  1051  	err := os.MkdirAll(to, 0700)
  1052  	if err != nil {
  1053  		return nil, err
  1054  	}
  1055  
  1056  	// checkpoint V6 produces multiple checkpoint part files that need to be copied over
  1057  	pattern := filePathPattern(from, filename)
  1058  	matched, err := filepath.Glob(pattern)
  1059  	if err != nil {
  1060  		return nil, fmt.Errorf("could not glob checkpoint file with pattern %v: %w", pattern, err)
  1061  	}
  1062  
  1063  	newPaths := make([]string, len(matched))
  1064  	// copy the checkpoint files concurrently
  1065  	var group errgroup.Group
  1066  
  1067  	for i, match := range matched {
  1068  		_, partfile := filepath.Split(match)
  1069  		newPath := filepath.Join(to, partfile)
  1070  		newPaths[i] = newPath
  1071  
  1072  		match := match
  1073  		group.Go(func() error {
  1074  			err := utilsio.Copy(match, newPath)
  1075  			if err != nil {
  1076  				return fmt.Errorf("cannot copy file from %v to %v: %w", match, newPath, err)
  1077  			}
  1078  			return nil
  1079  		})
  1080  	}
  1081  
  1082  	err = group.Wait()
  1083  	if err != nil {
  1084  		return nil, fmt.Errorf("fail to copy checkpoint files: %w", err)
  1085  	}
  1086  
  1087  	return newPaths, nil
  1088  }
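
        // A minimal usage sketch (hypothetical paths): for a V6 checkpoint this copies
        // the header file and its 17 part files:
        //
        //	paths, err := CopyCheckpointFile("checkpoint.00000042", "/data/tries", "/backup/tries")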