github.com/MetalBlockchain/subnet-evm@v0.4.9/sync/statesync/trie_segments.go

// (c) 2021-2022, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package statesync

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"sync"

	"github.com/MetalBlockchain/metalgo/utils/wrappers"
	"github.com/MetalBlockchain/subnet-evm/core/rawdb"
	"github.com/MetalBlockchain/subnet-evm/ethdb"
	syncclient "github.com/MetalBlockchain/subnet-evm/sync/client"
	"github.com/MetalBlockchain/subnet-evm/trie"
	"github.com/MetalBlockchain/subnet-evm/utils"
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/log"
)

var (
	_ syncclient.LeafSyncTask = &trieSegment{}
	_ fmt.Stringer            = &trieSegment{}
)

// trieToSync keeps the state of syncing a single trie.
// This can be a storage trie or the main trie.
type trieToSync struct {
	root    common.Hash
	account common.Hash

	// The trie consists of a slice of segments. Each
	// segment has a start and end range of keys, and
	// contains a pointer back to this struct.
	segments []*trieSegment

	// These fields are used to hash the segments in
	// order, even though they may finish syncing out
	// of order or concurrently.
	lock              sync.Mutex
	segmentsDone      map[int]struct{}
	segmentToHashNext int

	// We use a stack trie to hash the leafs and a
	// batch for writing it to disk.
	batch     ethdb.Batch
	stackTrie *trie.StackTrie

	// We keep a pointer to the overall sync operation,
	// used to add segments to the work queue and to
	// update the ETA.
	sync *stateSync

	// task implements the syncTask interface with methods
	// containing logic specific to the main trie or storage
	// tries.
	task       syncTask
	isMainTrie bool
}

// NewTrieToSync initializes a trieToSync and restores any previously started segments.
func NewTrieToSync(sync *stateSync, root common.Hash, account common.Hash, syncTask syncTask) (*trieToSync, error) {
	batch := sync.db.NewBatch()
	trieToSync := &trieToSync{
		sync:         sync,
		root:         root,
		account:      account,
		batch:        batch,
		stackTrie:    trie.NewStackTrie(batch),
		isMainTrie:   (root == sync.root),
		task:         syncTask,
		segmentsDone: make(map[int]struct{}),
	}
	return trieToSync, trieToSync.loadSegments()
}

// loadSegments reads persistent storage and initializes trieSegments that
// had been previously started and need to be resumed.
func (t *trieToSync) loadSegments() error {
	// Get an iterator for segments for t.root and see if we find anything.
	// This lets us check if this trie was previously segmented, in which
	// case we need to restore the same segments on resume.
	it := rawdb.NewSyncSegmentsIterator(t.sync.db, t.root)
	defer it.Release()

	// Track the start key of the previously found segment as we loop
	// over persisted values.
	var prevSegmentStart []byte

	for it.Next() {
		// If we find any persisted segments with the specified
		// prefix, we add a new segment to the trie here.
		// The segment we add represents a segment ending at the
		// key immediately prior to the segment we found on disk.
		// This is because we do not persist the beginning of
		// the first segment.
		_, segmentStart := rawdb.UnpackSyncSegmentKey(it.Key())
		segmentStartPos := binary.BigEndian.Uint16(segmentStart[:wrappers.ShortLen])
		t.addSegment(prevSegmentStart, addPadding(segmentStartPos-1, 0xff))

		// keep tracking the previous segment
		prevSegmentStart = segmentStart
	}
	if err := it.Error(); err != nil {
		return err
	}

	// this creates the last segment if any were found in the loop
	// and also handles the case where there were no segments persisted to disk.
	t.addSegment(prevSegmentStart, nil)

	for _, segment := range t.segments {
		// for each segment, find the last key already persisted
		// so syncing can begin at the subsequent key
		var lastKey []byte
		it := segment.trie.task.IterateLeafs(common.BytesToHash(segment.start))
		defer it.Release()
		for it.Next() {
			if len(segment.end) > 0 && bytes.Compare(it.Key(), segment.end) > 0 {
				// don't go past the end of the segment
				break
			}
			lastKey = common.CopyBytes(it.Key())
			segment.leafs++
		}
		if lastKey != nil {
			utils.IncrOne(lastKey)
			segment.pos = lastKey // syncing will start from this key
		}
		log.Debug("statesync: loading segment", "segment", segment)
	}
	return it.Error()
}
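
// The restore logic above is easiest to see with a concrete example.
// This is an illustrative sketch (not part of the original source): if
// segment start keys with 2-byte prefixes 0x4000 and 0x8000 were persisted
// for this root, the loop above restores three segments:
//
//	[nil, 0x3fff..ff]        // the first segment's start is never persisted
//	[0x4000..00, 0x7fff..ff]
//	[0x8000..00, nil]        // the last segment's end is never persisted
//
// with each 2-byte prefix padded to a full 32-byte key by addPadding.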

// startSyncing adds the trieToSync's segments to the work queue
func (t *trieToSync) startSyncing() {
	for _, segment := range t.segments {
		t.sync.segments <- segment // this will queue the segment for syncing
	}
}

// addSegment appends a newly created segment specified by [start] and
// [end] to [t.segments] and returns it.
// note: addSegment does not take a lock and therefore is called only
// before multiple segments are syncing concurrently.
func (t *trieToSync) addSegment(start, end []byte) *trieSegment {
	segment := &trieSegment{
		start: start,
		end:   end,
		trie:  t,
		idx:   len(t.segments),
		batch: t.sync.db.NewBatch(),
	}
	t.segments = append(t.segments, segment)
	return segment
}

// segmentFinished is called when the trie segment with index [idx] finishes syncing.
// It creates intermediary hash nodes for the trie up to the last contiguous segment
// received from the start.
func (t *trieToSync) segmentFinished(idx int) error {
	t.lock.Lock()
	defer t.lock.Unlock()

	log.Debug("statesync: segment finished", "segment", t.segments[idx])
	t.segmentsDone[idx] = struct{}{}
	for {
		if _, ok := t.segmentsDone[t.segmentToHashNext]; !ok {
			// if this is not the next contiguous segment from the
			// beginning of the trie, don't do anything.
			break
		}
		segment := t.segments[t.segmentToHashNext]

		// persist any items in the batch as they will be iterated below.
		if err := segment.batch.Write(); err != nil {
			return err
		}
		segment.batch.Reset() // reset the batch to free memory (even though it is no longer used)

		// iterate all the items from the start of the segment (end is checked in the loop)
		it := t.task.IterateLeafs(common.BytesToHash(segment.start))
		defer it.Release()

		for it.Next() {
			if len(segment.end) > 0 && bytes.Compare(it.Key(), segment.end) > 0 {
				// don't go past the end of the segment (data belongs to the next segment)
				break
			}
			// update the stack trie and cap the batch it writes to.
			value := common.CopyBytes(it.Value())
			if err := t.stackTrie.TryUpdate(it.Key(), value); err != nil {
				return err
			}
			if t.batch.ValueSize() > t.sync.batchSize {
				if err := t.batch.Write(); err != nil {
					return err
				}
				t.batch.Reset()
			}
		}
		if err := it.Error(); err != nil {
			return err
		}
		t.segmentToHashNext++
	}
	if t.segmentToHashNext < len(t.segments) {
		// trie not complete
		return nil
	}

	// when the trie is finished, this hashes any remaining nodes in the stack
	// trie and creates the root
	actualRoot, err := t.stackTrie.Commit()
	if err != nil {
		return err
	}
	if actualRoot != t.root {
		return fmt.Errorf("unexpected root, expected=%s, actual=%s, account=%s", t.root, actualRoot, t.account)
	}
	if !t.isMainTrie {
		// the batch containing the main trie's root will be committed on
		// sync completion.
		if err := t.batch.Write(); err != nil {
			return err
		}
	}

	// remove all segments for this root from persistent storage
	if err := rawdb.ClearSyncSegments(t.sync.db, t.root); err != nil {
		return err
	}
	return t.task.OnFinish()
}
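
// The in-order hashing pattern used by segmentFinished can be isolated into
// a minimal sketch. The following is illustrative only (not part of the
// original source); names like [finished] and [process] are hypothetical.
// Segments may complete in any order, but [process] is invoked strictly in
// index order by tracking a set of completed indices and a cursor, mirroring
// the [segmentsDone]/[segmentToHashNext] loop above.
func exampleInOrderCompletion(finished <-chan int, total int, process func(idx int)) {
	done := make(map[int]struct{})
	next := 0
	for idx := range finished {
		done[idx] = struct{}{}
		// advance the cursor over every contiguous completed segment
		for next < total {
			if _, ok := done[next]; !ok {
				break // wait until the next contiguous segment completes
			}
			process(next)
			next++
		}
	}
}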

// createSegmentsIfNeeded is called from the leaf handler. If the trie being
// synced has only one segment but a large number of leafs
// ([t.estimateSize() >= segmentThreshold]), it will create [numSegments-1]
// additional segments to sync the trie.
func (t *trieToSync) createSegmentsIfNeeded(numSegments int) error {
	if !t.shouldSegment() {
		return nil
	}

	return t.createSegments(numSegments)
}

// shouldSegment returns true if a trie should be separated into segments.
func (t *trieToSync) shouldSegment() bool {
	t.lock.Lock()
	defer t.lock.Unlock()

	// Return false if the trie has already been segmented.
	if len(t.segments) > 1 {
		return false
	}

	// Return true iff the estimated size of the trie meets or exceeds
	// [segmentThreshold].
	// Note: at this point there is only a single segment (loadSegments guarantees
	// there is at least one segment).
	segment := t.segments[0]
	return segment.estimateSize() >= uint64(segmentThreshold)
}

// createSegments divides the key space into [numSegments] consecutive segments.
// We use the first 2 bytes of the key to build the ranges and fill the rest
// with zeroes (for start keys) or ones (for end keys). [segmentStep] is the
// step between the first 2 bytes of the start keys of consecutive segments.
// createSegments should only be called once, while there is only one
// thread accessing this trie, such that there is no need to hold a lock.
func (t *trieToSync) createSegments(numSegments int) error {
	segment := t.segments[0]

	segmentStep := 0x10000 / numSegments

	for i := 0; i < numSegments; i++ {
		start := uint16(i * segmentStep)
		end := uint16(i*segmentStep + (segmentStep - 1))

		startBytes := addPadding(start, 0x00)
		endBytes := addPadding(end, 0xff)

		// Skip any portion of the trie that has already been synced.
		if bytes.Compare(segment.pos, endBytes) >= 0 {
			continue
		}

		// since the first segment is already syncing,
		// it does not need to be added to the task queue.
		// instead, we update its end and move on to creating
		// the next segment
		if segment.end == nil {
			segment.end = endBytes
			continue
		}

		// create and persist the segment
		segment := t.addSegment(startBytes, endBytes)
		if err := rawdb.WriteSyncSegment(t.sync.db, t.root, segment.start); err != nil {
			return err
		}
	}
	// add the newly created segments to the task queue
	// after creating them. We skip the first one, as it
	// is already syncing.
	// this avoids concurrent access to [t.segments].
	for i := 1; i < len(t.segments); i++ {
		t.sync.segments <- t.segments[i]
	}
	t.sync.stats.incTriesSegmented()
	log.Debug("statesync: trie segmented for parallel sync", "root", t.root, "account", t.account, "segments", len(t.segments))
	return nil
}
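
// A worked example of the ranges produced above (illustrative sketch, not
// part of the original source): with numSegments = 4, segmentStep is
// 0x10000/4 = 0x4000, so the 2-byte prefixes of the segments are
// [0x0000, 0x3fff], [0x4000, 0x7fff], [0x8000, 0xbfff], [0xc000, 0xffff].
// The hypothetical helper below reproduces just that boundary arithmetic.
func exampleSegmentBoundaries(numSegments int) [][2][]byte {
	segmentStep := 0x10000 / numSegments
	bounds := make([][2][]byte, 0, numSegments)
	for i := 0; i < numSegments; i++ {
		start := addPadding(uint16(i*segmentStep), 0x00)             // e.g. 0x4000 followed by 30 zero bytes
		end := addPadding(uint16(i*segmentStep+segmentStep-1), 0xff) // e.g. 0x7fff followed by 30 0xff bytes
		bounds = append(bounds, [2][]byte{start, end})
	}
	return bounds
}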

// trieSegment keeps the state of syncing one segment of a [trieToSync]
// and points back to the [trieToSync] it belongs to.
// Each trieSegment is accessed by its own goroutine, so locks are not
// needed to access its fields.
type trieSegment struct {
	start []byte
	pos   []byte
	end   []byte

	trie  *trieToSync // points back to the trie the segment belongs to
	idx   int         // index of this segment in the trie's segment slice
	batch ethdb.Batch // batch for writing leafs to
	leafs uint64      // number of leafs added to the segment
}

func (t *trieSegment) String() string {
	return fmt.Sprintf(
		"[%s](%d/%d) (start=%s,end=%s)",
		t.trie.root, t.idx+1, len(t.trie.segments),
		common.BytesToHash(t.start).TerminalString(),
		common.BytesToHash(t.end).TerminalString(),
	)
}

// these functions implement the LeafSyncTask interface.
func (t *trieSegment) Root() common.Hash      { return t.trie.root }
func (t *trieSegment) Account() common.Hash   { return t.trie.account }
func (t *trieSegment) End() []byte            { return t.end }
func (t *trieSegment) OnStart() (bool, error) { return t.trie.task.OnStart() }
func (t *trieSegment) OnFinish() error        { return t.trie.segmentFinished(t.idx) }

func (t *trieSegment) Start() []byte {
	if t.pos != nil {
		return t.pos
	}
	return t.start
}

func (t *trieSegment) OnLeafs(keys, vals [][]byte) error {
	// invoke the onLeafs callback
	if err := t.trie.task.OnLeafs(t.batch, keys, vals); err != nil {
		return err
	}
	// cap the segment's batch
	if t.batch.ValueSize() > t.trie.sync.batchSize {
		if err := t.batch.Write(); err != nil {
			return err
		}
		t.batch.Reset()
	}
	t.leafs += uint64(len(keys))
	if len(keys) > 0 {
		t.pos = keys[len(keys)-1] // remember the position, used in estimating trie size
		utils.IncrOne(t.pos)
	}

	// update the ETA
	t.trie.sync.stats.incLeafs(t, uint64(len(keys)), t.estimateSize())

	if t.trie.root == t.trie.sync.root {
		return t.trie.createSegmentsIfNeeded(numMainTrieSegments)
	} else {
		return t.trie.createSegmentsIfNeeded(numStorageTrieSegments)
	}
}
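
// A note on the position tracking above (illustrative, not part of the
// original source): utils.IncrOne advances a big-endian byte slice by one
// with carry, so after a batch ending at key 0x12ff..ff the segment resumes
// at 0x1300..00, ensuring the next request starts strictly after the last
// received leaf.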

// estimateSize returns an estimate of the number of leafs left to sync
// in this segment, assuming the trie has uniform key density.
// Note: returns 0 if there has been no progress in syncing the trie.
func (t *trieSegment) estimateSize() uint64 {
	start, pos, end := uint16(0), uint16(0), uint16(0xffff)
	if len(t.start) > 0 {
		start = binary.BigEndian.Uint16(t.start)
	}
	if len(t.pos) > 0 {
		pos = binary.BigEndian.Uint16(t.pos)
	}
	if len(t.end) > 0 {
		end = binary.BigEndian.Uint16(t.end)
	}
	progress := pos - start
	if progress == 0 {
		// this should not occur since estimateSize is called after processing
		// a batch of leafs, which sets [pos].
		// avoid division by 0 out of caution.
		return 0
	}
	left := end - pos
	return t.leafs * uint64(left) / uint64(progress)
}
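
// A worked example of the estimate above (illustrative, not part of the
// original source): for a segment spanning the full key space
// (start=0x0000, end=0xffff) that has synced 1,000 leafs up to pos=0x4000,
// progress = 0x4000 (16384) and left = 0xbfff (49151), giving an estimate
// of 1000 * 49151 / 16384 ≈ 2999 leafs remaining.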

// addPadding returns a []byte of length [common.HashLength], starting with the
// BigEndian representation of [pos], and the rest filled with [padding].
func addPadding(pos uint16, padding byte) []byte {
	packer := wrappers.Packer{Bytes: make([]byte, common.HashLength)}
	packer.PackShort(pos)
	packer.PackFixedBytes(bytes.Repeat([]byte{padding}, common.HashLength-wrappers.ShortLen))
	return packer.Bytes
}
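
// For example (illustrative, not part of the original source):
// addPadding(0x1234, 0xff) returns a 32-byte slice whose first two bytes are
// 0x12, 0x34 and whose remaining 30 bytes are all 0xff, i.e. the largest key
// with the 2-byte prefix 0x1234.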